flappa-doormal 2.19.0 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,1123 +1,4 @@
1
- //#region src/segmentation/tokens.ts
2
- /**
3
- * Arabic base letters used by low-level dictionary-style regex helpers.
4
- *
5
- * This is intentionally broader than `{{harf}}`:
6
- * - includes standalone hamza `ء`
7
- * - stays as a raw regex fragment rather than a template token
8
- */
9
- const ARABIC_BASE_LETTER_CLASS = "[ء-غف-ي]";
10
- /**
11
- * Arabic combining marks / annotation signs used by low-level regex helpers.
12
- */
13
- const ARABIC_MARKS_CLASS = "[\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]";
14
- /**
15
- * A single Arabic base letter followed by zero or more combining marks.
16
- */
17
- const ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN = `${ARABIC_BASE_LETTER_CLASS}${ARABIC_MARKS_CLASS}*`;
18
- /**
19
- * One or more Arabic letters, where each letter may carry combining marks.
20
- */
21
- const ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN = `(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN})+`;
22
- const ARABIC_SPACED_CODE_ATOM = `[أ-غف-ي]${ARABIC_MARKS_CLASS}*`;
23
- const RUMUZ_ATOM = `(?:${[
24
- "تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
25
- "خت",
26
- "خغ",
27
- "بخ",
28
- "عخ",
29
- "مق",
30
- "مت",
31
- "عس",
32
- "سي",
33
- "سن",
34
- "كن",
35
- "مد",
36
- "قد",
37
- "خد",
38
- "فد",
39
- "دل",
40
- "كد",
41
- "غد",
42
- "صد",
43
- "دت",
44
- "دس",
45
- "تم",
46
- "فق",
47
- "دق",
48
- "[خرزيمنصسدفلتقع](?![\\u064B-\\u0652\\u0670أ-ي])",
49
- "(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
50
- ].join("|")})`;
51
- const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
52
- const BASE_TOKENS = {
53
- /** Chapter marker (باب). */
54
- bab: "باب",
55
- /** Basmala (بسم الله). Also matches ﷽. */
56
- basmalah: ["بسم الله", "﷽"].join("|"),
57
- /** Bullet point variants: `•`, `*`, `°`. */
58
- bullet: "[•*°]",
59
- /** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
60
- dash: "[-–—ـ]",
61
- /** Section marker (فصل / مسألة). */
62
- fasl: ["مسألة", "فصل"].join("|"),
63
- /** Single Arabic letter (أ-ي). Does NOT include diacritics. */
64
- harf: "[أ-ي]",
65
- /** One or more single Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter (e.g. `د ت س`, `هـ ث`). For multi-letter codes use `{{rumuz}}`. */
66
- harfs: `${ARABIC_SPACED_CODE_ATOM}(?:\\s+${ARABIC_SPACED_CODE_ATOM})*`,
67
- /** Horizontal rule / separator: 5+ repeated dashes, underscores, equals, or tatweels. Mixed allowed. */
68
- hr: "[-–—ـ_=]{5,}",
69
- /** Book marker (كتاب). */
70
- kitab: "كتاب",
71
- /** Hadith transmission phrases (حدثنا, أخبرنا, حدثني, etc.). */
72
- naql: [
73
- "حدثني",
74
- "وأخبرنا",
75
- "حدثنا",
76
- "سمعت",
77
- "أنبأنا",
78
- "وحدثنا",
79
- "أخبرنا",
80
- "وحدثني",
81
- "وحدثنيه"
82
- ].join("|"),
83
- /** Newline character. Useful for breakpoints that split on line boundaries. */
84
- newline: "\\n",
85
- /** Single ASCII digit (0-9). */
86
- num: "\\d",
87
- /** One or more ASCII digits (0-9)+. */
88
- nums: "\\d+",
89
- /** Single Arabic-Indic digit (٠-٩, U+0660-U+0669). */
90
- raqm: "[\\u0660-\\u0669]",
91
- /** One or more Arabic-Indic digits (٠-٩)+. */
92
- raqms: "[\\u0660-\\u0669]+",
93
- /** Rijāl/takhrīj source abbreviations. Matches one or more codes separated by whitespace. */
94
- rumuz: RUMUZ_BLOCK,
95
- /** Arabic/common punctuation: `.`, `!`, `?`, `؟`, `؛`. */
96
- tarqim: "[.!?؟؛]"
97
- };
98
- /** Pre-defined token constants for use in patterns. */
99
- const Token = {
100
- /** Chapter marker - باب */
101
- BAB: "{{bab}}",
102
- /** Basmala - بسم الله */
103
- BASMALAH: "{{basmalah}}",
104
- /** Bullet point variants */
105
- BULLET: "{{bullet}}",
106
- /** Dash variants (hyphen, en-dash, em-dash, tatweel) */
107
- DASH: "{{dash}}",
108
- /** Section marker - فصل / مسألة */
109
- FASL: "{{fasl}}",
110
- /** Single Arabic letter */
111
- HARF: "{{harf}}",
112
- /** Multiple Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter */
113
- HARFS: "{{harfs}}",
114
- /** Horizontal rule / separator (repeated dashes) */
115
- HR: "{{hr}}",
116
- /** Book marker - كتاب */
117
- KITAB: "{{kitab}}",
118
- /** Hadith transmission phrases */
119
- NAQL: "{{naql}}",
120
- /** Newline character (for breakpoints) */
121
- NEWLINE: "{{newline}}",
122
- /** Single ASCII digit */
123
- NUM: "{{num}}",
124
- /** Composite: {{raqms}} {{dash}} (space) */
125
- NUMBERED: "{{numbered}}",
126
- /** One or more ASCII digits */
127
- NUMS: "{{nums}}",
128
- /** Single Arabic-Indic digit */
129
- RAQM: "{{raqm}}",
130
- /** One or more Arabic-Indic digits */
131
- RAQMS: "{{raqms}}",
132
- /** Source abbreviations (rijāl/takhrīj) */
133
- RUMUZ: "{{rumuz}}",
134
- /** Punctuation marks */
135
- TARQIM: "{{tarqim}}"
136
- };
137
- /** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
138
- const withCapture = (token, name) => {
139
- const match = token.match(/^\{\{(\w+)\}\}$/);
140
- if (!match) return `{{:${name}}}`;
141
- return `{{${match[1]}:${name}}}`;
142
- };
143
- /** Composite tokens that reference base tokens. Pre-expanded at load time. @internal */
144
- const COMPOSITE_TOKENS = {
145
- /** Common hadith numbering format: Arabic-Indic digits + dash + space. */
146
- numbered: "{{raqms}} {{dash}} " };
147
- /** Expands composite tokens (e.g. `{{numbered}}`) to their underlying template form. */
148
- const expandCompositeTokensInTemplate = (template) => {
149
- let out = template;
150
- for (let i = 0; i < 10; i++) {
151
- const next = out.replace(/\{\{(\w+)\}\}/g, (m, tokenName) => COMPOSITE_TOKENS[tokenName] ?? m);
152
- if (next === out) break;
153
- out = next;
154
- }
155
- return out;
156
- };
157
- /**
158
- * Expands base tokens in a template string.
159
- * Used internally to pre-expand composite tokens.
160
- *
161
- * @param template - Template string with `{{token}}` placeholders
162
- * @returns Expanded pattern with base tokens replaced
163
- * @internal
164
- */
165
- const expandBaseTokens = (template) => template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => BASE_TOKENS[tokenName] ?? `{{${tokenName}}}`);
166
- /**
167
- * Token definitions mapping human-readable token names to regex patterns.
168
- *
169
- * Tokens are used in template strings with double-brace syntax:
170
- * - `{{token}}` - Expands to the pattern (non-capturing in context)
171
- * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
172
- * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
173
- *
174
- * @remarks
175
- * These patterns are designed for Arabic text matching. For diacritic-insensitive
176
- * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
177
- * which applies `makeDiacriticInsensitive()` to the expanded patterns.
178
- *
179
- * @example
180
- * // Using tokens in a split rule
181
- * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
182
- *
183
- * @example
184
- * // Using tokens with named captures
185
- * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
186
- *
187
- * @example
188
- * // Using the numbered convenience token
189
- * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
190
- */
191
- const TOKEN_PATTERNS = {
192
- ...BASE_TOKENS,
193
- ...Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([k, v]) => [k, expandBaseTokens(v)]))
194
- };
195
- /**
196
- * Regex pattern for matching tokens with optional named capture syntax.
197
- *
198
- * Matches:
199
- * - `{{token}}` - Simple token (group 1 = token name, group 2 = empty)
200
- * - `{{token:name}}` - Token with capture (group 1 = token, group 2 = name)
201
- * - `{{:name}}` - Capture-only (group 1 = empty, group 2 = name)
202
- *
203
- * @internal
204
- */
205
- const TOKEN_WITH_CAPTURE_REGEX = /\{\{(\w*):?(\w*)\}\}/g;
206
- /**
207
- * Regex pattern for simple token matching (no capture syntax).
208
- *
209
- * Matches only `{{token}}` format where token is one or more word characters.
210
- * Used by `containsTokens()` for quick detection.
211
- *
212
- * @internal
213
- */
214
- const SIMPLE_TOKEN_REGEX = /\{\{(\w+)\}\}/g;
215
- /**
216
- * Checks if a query string contains template tokens.
217
- *
218
- * Performs a quick test for `{{token}}` patterns without actually
219
- * expanding them. Useful for determining whether to apply token
220
- * expansion to a string.
221
- *
222
- * @param query - String to check for tokens
223
- * @returns `true` if the string contains at least one `{{token}}` pattern
224
- *
225
- * @example
226
- * containsTokens('{{raqms}} {{dash}}') // → true
227
- * containsTokens('plain text') // → false
228
- * containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
229
- */
230
- const containsTokens = (query) => {
231
- SIMPLE_TOKEN_REGEX.lastIndex = 0;
232
- return SIMPLE_TOKEN_REGEX.test(query);
233
- };
234
- const splitTemplateIntoSegments = (query) => {
235
- const segments = [];
236
- let lastIndex = 0;
237
- TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
238
- for (const match of query.matchAll(TOKEN_WITH_CAPTURE_REGEX)) {
239
- if (match.index > lastIndex) segments.push({
240
- type: "text",
241
- value: query.slice(lastIndex, match.index)
242
- });
243
- segments.push({
244
- type: "token",
245
- value: match[0]
246
- });
247
- lastIndex = match.index + match[0].length;
248
- }
249
- if (lastIndex < query.length) segments.push({
250
- type: "text",
251
- value: query.slice(lastIndex)
252
- });
253
- return segments;
254
- };
255
- const maybeApplyFuzzyToText = (text, fuzzyTransform) => fuzzyTransform && /[\u0600-\u06FF]/u.test(text) ? fuzzyTransform(text) : text;
256
- const maybeApplyFuzzyToTokenPattern = (tokenPattern, fuzzyTransform) => !fuzzyTransform ? tokenPattern : tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/u.test(part) ? fuzzyTransform(part) : part).join("|");
257
- const parseTokenLiteral = (literal) => {
258
- TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
259
- const m = TOKEN_WITH_CAPTURE_REGEX.exec(literal);
260
- return m ? {
261
- captureName: m[2],
262
- tokenName: m[1]
263
- } : null;
264
- };
265
- const createCaptureRegistry = (capturePrefix) => {
266
- const captureNames = [];
267
- const captureNameCounts = /* @__PURE__ */ new Map();
268
- const register = (baseName) => {
269
- const count = captureNameCounts.get(baseName) ?? 0;
270
- captureNameCounts.set(baseName, count + 1);
271
- const uniqueName = count === 0 ? baseName : `${baseName}_${count + 1}`;
272
- const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
273
- captureNames.push(prefixedName);
274
- return prefixedName;
275
- };
276
- return {
277
- captureNames,
278
- register
279
- };
280
- };
281
- const expandTokenLiteral = (literal, opts) => {
282
- const parsed = parseTokenLiteral(literal);
283
- if (!parsed) return literal;
284
- const { tokenName, captureName } = parsed;
285
- if (!tokenName && captureName) return `(?<${opts.registerCapture(captureName)}>.+)`;
286
- let tokenPattern = TOKEN_PATTERNS[tokenName];
287
- if (!tokenPattern) return literal;
288
- tokenPattern = maybeApplyFuzzyToTokenPattern(tokenPattern, opts.fuzzyTransform);
289
- if (captureName) return `(?<${opts.registerCapture(captureName)}>${tokenPattern})`;
290
- return tokenPattern;
291
- };
292
- /**
293
- * Expands template tokens with support for named captures.
294
- *
295
- * This is the primary token expansion function that handles all token syntax:
296
- * - `{{token}}` → Expands to the token's pattern (no capture group)
297
- * - `{{token:name}}` → Expands to `(?<name>pattern)` (named capture)
298
- * - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
299
- *
300
- * Unknown tokens are left as-is in the output, allowing for partial templates.
301
- *
302
- * @param query - The template string containing tokens
303
- * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
304
- * Applied to both token patterns and plain Arabic text between tokens.
305
- * Typically `makeDiacriticInsensitive` from the fuzzy module.
306
- * @returns Object with expanded pattern, capture names, and capture flag
307
- *
308
- * @example
309
- * // Simple token expansion
310
- * expandTokensWithCaptures('{{raqms}} {{dash}}')
311
- * // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
312
- *
313
- * @example
314
- * // Named capture
315
- * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
316
- * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
317
- *
318
- * @example
319
- * // Capture-only token
320
- * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
321
- * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
322
- *
323
- * @example
324
- * // With fuzzy transform
325
- * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
326
- * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
327
- */
328
- const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
329
- const segments = splitTemplateIntoSegments(query);
330
- const registry = createCaptureRegistry(capturePrefix);
331
- const pattern = segments.map((segment) => segment.type === "text" ? maybeApplyFuzzyToText(segment.value, fuzzyTransform) : expandTokenLiteral(segment.value, {
332
- capturePrefix,
333
- fuzzyTransform,
334
- registerCapture: registry.register
335
- })).join("");
336
- return {
337
- captureNames: registry.captureNames,
338
- hasCaptures: registry.captureNames.length > 0,
339
- pattern
340
- };
341
- };
342
- /**
343
- * Expands template tokens in a query string to their regex equivalents.
344
- *
345
- * This is the simple version without capture support. It returns only the
346
- * expanded pattern string, not capture metadata.
347
- *
348
- * Unknown tokens are left as-is, allowing for partial templates.
349
- *
350
- * @param query - Template string containing `{{token}}` placeholders
351
- * @returns Expanded regex pattern string
352
- *
353
- * @example
354
- * expandTokens('، {{raqms}}') // → '، [\\u0660-\\u0669]+'
355
- * expandTokens('{{raqm}}*') // → '[\\u0660-\\u0669]*'
356
- * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
357
- * expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
358
- *
359
- * @see expandTokensWithCaptures for full capture group support
360
- */
361
- const expandTokens = (query) => expandTokensWithCaptures(query).pattern;
362
- /**
363
- * Converts a template string to a compiled RegExp.
364
- *
365
- * Expands all tokens and attempts to compile the result as a RegExp
366
- * with Unicode flag. Returns `null` if the resulting pattern is invalid.
367
- *
368
- * @remarks
369
- * This function dynamically compiles regular expressions from template strings.
370
- * If templates may come from untrusted sources, be aware of potential ReDoS
371
- * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
372
- * Consider validating pattern complexity or applying execution timeouts when
373
- * running user-submitted patterns.
374
- *
375
- * @param template - Template string containing `{{token}}` placeholders
376
- * @returns Compiled RegExp with 'u' flag, or `null` if invalid
377
- *
378
- * @example
379
- * templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
380
- * templateToRegex('{{raqms}}+') // → /[٠-٩]++/u (might be invalid in some engines)
381
- * templateToRegex('(((') // → null (invalid regex)
382
- */
383
- const templateToRegex = (template) => {
384
- const expanded = expandTokens(template);
385
- try {
386
- return new RegExp(expanded, "u");
387
- } catch {
388
- return null;
389
- }
390
- };
391
- /**
392
- * Lists all available token names defined in `TOKEN_PATTERNS`.
393
- *
394
- * Useful for documentation, validation, or building user interfaces
395
- * that show available tokens.
396
- *
397
- * @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
398
- *
399
- * @example
400
- * getAvailableTokens()
401
- * // → ['bab', 'basmalah', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
402
- */
403
- const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
404
- /**
405
- * Gets the regex pattern for a specific token name.
406
- *
407
- * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
408
- * without any expansion or capture group wrapping.
409
- *
410
- * @param tokenName - The token name to look up (e.g., `'raqms'`, `'dash'`, `'harfs'`)
411
- * @returns The regex pattern string for that known token
412
- *
413
- * @example
414
- * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
415
- * getTokenPattern('dash') // → '[-–—ـ]'
416
- * getTokenPattern('harfs') // → pattern for spaced isolated Arabic letter codes
417
- */
418
- const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
419
- /**
420
- * Regex to detect fuzzy-default tokens in a pattern string.
421
- * Matches {{token}} or {{token:name}} syntax.
422
- */
423
- const FUZZY_TOKEN_REGEX = new RegExp(`\\{\\{(?:${[
424
- "bab",
425
- "basmalah",
426
- "fasl",
427
- "kitab",
428
- "naql"
429
- ].join("|")})(?::\\w+)?\\}\\}`, "g");
430
- /**
431
- * Checks if a pattern (or array of patterns) contains tokens that should
432
- * default to fuzzy matching.
433
- *
434
- * Fuzzy-default tokens are: bab, basmalah, fasl, kitab, naql
435
- *
436
- * @param patterns - Single pattern string or array of pattern strings
437
- * @returns `true` if any pattern contains a fuzzy-default token
438
- *
439
- * @example
440
- * shouldDefaultToFuzzy('{{bab}} الإيمان') // true
441
- * shouldDefaultToFuzzy('{{raqms}} {{dash}}') // false
442
- * shouldDefaultToFuzzy(['{{kitab}}', '{{raqms}}']) // true
443
- */
444
- const shouldDefaultToFuzzy = (patterns) => {
445
- return (Array.isArray(patterns) ? patterns : [patterns]).some((p) => {
446
- FUZZY_TOKEN_REGEX.lastIndex = 0;
447
- return FUZZY_TOKEN_REGEX.test(p);
448
- });
449
- };
450
- /**
451
- * Apply token mappings to a template string.
452
- *
453
- * Transforms `{{token}}` into `{{token:name}}` based on the provided mappings.
454
- * Useful for applying user-configured capture names to a raw template.
455
- *
456
- * - Only affects exact matches of `{{token}}`.
457
- * - Does NOT affect tokens that already have a capture name (e.g. `{{token:existing}}`).
458
- * - Does NOT affect capture-only tokens (e.g. `{{:name}}`).
459
- *
460
- * @param template - The template string to transform
461
- * @param mappings - Array of mappings from token name to capture name
462
- * @returns Transformed template string with captures applied
463
- *
464
- * @example
465
- * applyTokenMappings('{{raqms}} {{dash}}', [{ token: 'raqms', name: 'num' }])
466
- * // → '{{raqms:num}} {{dash}}'
467
- */
468
- const applyTokenMappings = (template, mappings) => {
469
- let result = template;
470
- for (const { token, name } of mappings) {
471
- if (!token || !name) continue;
472
- const regex = new RegExp(`\\{\\{${token}\\}\\}`, "g");
473
- result = result.replace(regex, `{{${token}:${name}}}`);
474
- }
475
- return result;
476
- };
477
- /**
478
- * Strip token mappings from a template string.
479
- *
480
- * Transforms `{{token:name}}` back into `{{token}}`.
481
- * Also transforms `{{:name}}` patterns (capture-only) into `{{}}` (which is invalid/empty).
482
- *
483
- * Useful for normalizing templates for storage or comparison.
484
- *
485
- * @param template - The template string to strip
486
- * @returns Template string with capture names removed
487
- *
488
- * @example
489
- * stripTokenMappings('{{raqms:num}} {{dash}}')
490
- * // → '{{raqms}} {{dash}}'
491
- */
492
- const stripTokenMappings = (template) => {
493
- return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
494
- };
495
- //#endregion
496
- //#region src/utils/textUtils.ts
497
- /**
498
- * Normalizes line endings to Unix-style (`\n`).
499
- *
500
- * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
501
- * for consistent pattern matching across platforms.
502
- *
503
- * @param content - Raw content with potentially mixed line endings
504
- * @returns Content with all line endings normalized to `\n`
505
- */
506
- const normalizeLineEndings = (content) => {
507
- return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
508
- };
509
- /**
510
- * Escapes regex metacharacters (parentheses and brackets) in template patterns,
511
- * but preserves content inside `{{...}}` token delimiters.
512
- *
513
- * This allows users to write intuitive patterns like `({{harf}}):` instead of
514
- * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
515
- * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
516
- *
517
- * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
518
- * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
519
- *
520
- * @example
521
- * escapeTemplateBrackets('({{harf}}): ')
522
- * // → '\\({{harf}}\\): '
523
- *
524
- * @example
525
- * escapeTemplateBrackets('[{{raqm}}] ')
526
- * // → '\\[{{raqm}}\\] '
527
- *
528
- * @example
529
- * escapeTemplateBrackets('{{harf}}')
530
- * // → '{{harf}}' (unchanged - no brackets outside tokens)
531
- */
532
- const escapeTemplateBrackets = (pattern) => {
533
- return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
534
- };
535
- /**
536
- * Character class matching all Arabic diacritics (Tashkeel/Harakat).
537
- *
538
- * Includes the following diacritical marks:
539
- * - U+0640: ـ (tatweel / kashida)
540
- * - U+064B: ً (fathatan - double fatha)
541
- * - U+064C: ٌ (dammatan - double damma)
542
- * - U+064D: ٍ (kasratan - double kasra)
543
- * - U+064E: َ (fatha - short a)
544
- * - U+064F: ُ (damma - short u)
545
- * - U+0650: ِ (kasra - short i)
546
- * - U+0651: ّ (shadda - gemination)
547
- * - U+0652: ْ (sukun - no vowel)
548
- *
549
- * @internal
550
- */
551
- const DIACRITICS_CLASS = "[ـًٌٍَُِّْ]";
552
- /**
553
- * Groups of equivalent Arabic characters.
554
- *
555
- * Characters within the same group are considered equivalent for matching purposes.
556
- * This handles common variations in Arabic text where different characters are
557
- * used interchangeably or have the same underlying meaning.
558
- *
559
- * Equivalence groups:
560
- * - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
561
- * - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
562
- * - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
563
- *
564
- * @internal
565
- */
566
- const EQUIV_GROUPS = [
567
- [
568
- "ا",
569
- "آ",
570
- "أ",
571
- "إ"
572
- ],
573
- ["ة", "ه"],
574
- ["ى", "ي"]
575
- ];
576
- const DIACRITICS_AND_MARKS_REGEX = new RegExp(ARABIC_MARKS_CLASS, "g");
577
- /**
578
- * Escapes a string for safe inclusion in a regular expression.
579
- *
580
- * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
581
- *
582
- * @param s - Any string to escape
583
- * @returns String with regex metacharacters escaped
584
- *
585
- * @example
586
- * escapeRegex('hello.world') // → 'hello\\.world'
587
- * escapeRegex('[test]') // → '\\[test\\]'
588
- * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
589
- */
590
- const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
591
- const getEquivClass = (ch) => {
592
- const group = EQUIV_GROUPS.find((g) => g.includes(ch));
593
- return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
594
- };
595
- const normalizeArabicLight = (str) => {
596
- return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
597
- };
598
- /**
599
- * Normalizes Arabic text for exact comparisons while tolerating common variants.
600
- *
601
- * This removes Arabic diacritics, collapses whitespace, removes joiners, and
602
- * maps common equivalent letters to a shared canonical form:
603
- * - ا/آ/أ/إ -> ا
604
- * - ة/ه -> ه
605
- * - ى/ي -> ي
606
- */
607
- const normalizeArabicForComparison = (text) => {
608
- return Array.from(normalizeArabicLight(text).replace(DIACRITICS_AND_MARKS_REGEX, "")).map((ch) => {
609
- if (ch === "آ" || ch === "أ" || ch === "إ") return "ا";
610
- if (ch === "ة") return "ه";
611
- if (ch === "ى") return "ي";
612
- return ch;
613
- }).join("");
614
- };
615
- const makeDiacriticInsensitive = (text) => {
616
- const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
617
- return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
618
- };
619
- const isCombiningMarkOrSelector = (char) => {
620
- if (!char) return false;
621
- return /\p{M}/u.test(char) || char === "︎" || char === "️";
622
- };
623
- const isJoiner = (char) => char === "‌" || char === "‍";
624
- /**
625
- * Ensures the position does not split a grapheme cluster (surrogate pairs,
626
- * combining marks, or zero-width joiners / variation selectors).
627
- *
628
- * This is only used as a last-resort fallback when we are forced to split
629
- * near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
630
- */
631
- const adjustForUnicodeBoundary = (content, position) => {
632
- let adjusted = position;
633
- while (adjusted > 0) {
634
- const high = content.charCodeAt(adjusted - 1);
635
- const low = content.charCodeAt(adjusted);
636
- if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
637
- adjusted -= 1;
638
- continue;
639
- }
640
- const nextChar = content[adjusted];
641
- const prevChar = content[adjusted - 1];
642
- if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
643
- adjusted -= 1;
644
- continue;
645
- }
646
- break;
647
- }
648
- return adjusted;
649
- };
650
- //#endregion
651
- //#region src/analysis/shared.ts
652
- const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
653
- const TOKEN_PRIORITY_ORDER$1 = [
654
- "basmalah",
655
- "kitab",
656
- "bab",
657
- "fasl",
658
- "naql",
659
- "rumuz",
660
- "numbered",
661
- "raqms",
662
- "raqm",
663
- "dash",
664
- "bullet",
665
- "tarqim"
666
- ];
667
- const buildTokenPriority = () => {
668
- const allTokens = new Set(getAvailableTokens());
669
- return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
670
- };
671
- const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
672
- const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED]/gu, "");
673
- const compileTokenRegexes = (tokenNames) => tokenNames.map((token) => {
674
- const pat = TOKEN_PATTERNS[token];
675
- if (!pat) return null;
676
- try {
677
- return {
678
- re: new RegExp(pat, "uy"),
679
- token
680
- };
681
- } catch {
682
- return null;
683
- }
684
- }).filter((x) => x !== null);
685
- const appendWs = (out, mode) => {
686
- if (!out) return out;
687
- const suffix = mode === "space" ? " " : "\\s*";
688
- return out.endsWith(suffix) ? out : `${out}${suffix}`;
689
- };
690
- const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
691
- let best = null;
692
- for (const { token, re } of compiled) {
693
- re.lastIndex = pos;
694
- const m = re.exec(s);
695
- if (!m || m.index !== pos) continue;
696
- if (!best || m[0].length > best.text.length) best = {
697
- text: m[0],
698
- token
699
- };
700
- }
701
- if (best?.token === "rumuz") {
702
- const end = pos + best.text.length;
703
- const next = end < s.length ? s[end] : "";
704
- if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
705
- }
706
- return best;
707
- };
708
- const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
709
- const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
710
- //#endregion
711
- //#region src/analysis/line-starts.ts
712
- const resolveOptions$1 = (options = {}) => ({
713
- includeFirstWordFallback: options.includeFirstWordFallback ?? true,
714
- lineFilter: options.lineFilter,
715
- maxExamples: options.maxExamples ?? 1,
716
- minCount: options.minCount ?? 3,
717
- minLineLength: options.minLineLength ?? 6,
718
- normalizeArabicDiacritics: options.normalizeArabicDiacritics ?? true,
719
- prefixChars: options.prefixChars ?? 60,
720
- prefixMatchers: options.prefixMatchers ?? [/^#+/u],
721
- sortBy: options.sortBy ?? "specificity",
722
- topK: options.topK ?? 40,
723
- whitespace: options.whitespace ?? "regex"
724
- });
725
- const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
726
- const computeSpecificity = (pattern) => ({
727
- literalLen: pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "").length,
728
- tokenCount: countTokenMarkers(pattern)
729
- });
730
- const compareBySpecificity = (a, b) => {
731
- const sa = computeSpecificity(a.pattern), sb = computeSpecificity(b.pattern);
732
- return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
733
- };
734
- const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
735
- const appendPrefix = (s, pos, out, matchers, ws) => {
736
- for (const re of matchers) {
737
- if (pos >= s.length) break;
738
- const m = re.exec(s.slice(pos));
739
- if (!m?.index && m?.[0]) {
740
- out += escapeSignatureLiteral(m[0]);
741
- pos += m[0].length;
742
- const wsm = /^[ \t]+/u.exec(s.slice(pos));
743
- if (wsm) {
744
- pos += wsm[0].length;
745
- out = appendWs(out, ws);
746
- }
747
- return {
748
- matched: true,
749
- out,
750
- pos
751
- };
752
- }
753
- }
754
- return {
755
- matched: false,
756
- out,
757
- pos
758
- };
759
- };
760
- const appendToken = (s, pos, out, compiled) => {
761
- const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
762
- return best ? {
763
- matched: true,
764
- out: `${out}{{${best.token}}}`,
765
- pos: pos + best.text.length
766
- } : {
767
- matched: false,
768
- out,
769
- pos
770
- };
771
- };
772
- const appendDelimiter = (s, pos, out) => {
773
- const ch = s[pos];
774
- return ch && isCommonDelimiter(ch) ? {
775
- matched: true,
776
- out: `${out}${escapeSignatureLiteral(ch)}`,
777
- pos: pos + 1
778
- } : {
779
- matched: false,
780
- out,
781
- pos
782
- };
783
- };
784
- const appendFallbackWord = (s, pos, out) => {
785
- const word = extractFirstWord(s.slice(pos));
786
- return word ? `${out}${escapeSignatureLiteral(word)}` : null;
787
- };
788
/**
 * Perform one step of line-start tokenization at `pos`.
 *
 * Attempts, in order: whitespace, a known token, then (only once something has
 * already matched) a delimiter. When none applies the step is terminal
 * (`done: true`), optionally appending the first word as a literal fallback.
 *
 * @returns `{ done, matchedAny, matchedToken, out, pos, steps }` describing the
 * state after the step; `steps` is the amount the caller adds to its budget.
 */
const consumeLineStartStep = (s, pos, out, compiled, opts, matchedAny, matchedToken) => {
	const result = (done, any, token, nextOut, nextPos, steps) => ({
		done,
		matchedAny: any,
		matchedToken: token,
		out: nextOut,
		pos: nextPos,
		steps
	});
	const ws = skipWhitespace$1(s, pos, out, opts.whitespace);
	if (ws.skipped) return result(false, matchedAny, matchedToken, ws.out, ws.pos, 0);
	const tok = appendToken(s, pos, out, compiled);
	if (tok.matched) return result(false, true, true, tok.out, tok.pos, 1);
	// Terminal result that leaves the signature exactly as it was.
	const stop = result(true, matchedAny, matchedToken, out, pos, 0);
	if (!matchedAny) {
		// Nothing matched yet: the only remaining option is the first-word fallback.
		if (!opts.includeFirstWordFallback) return stop;
		const firstWord = appendFallbackWord(s, pos, out);
		return firstWord ? result(true, true, matchedToken, firstWord, pos, 0) : stop;
	}
	const delim = appendDelimiter(s, pos, out);
	if (delim.matched) return result(false, matchedAny, matchedToken, delim.out, delim.pos, 0);
	if (opts.includeFirstWordFallback && !matchedToken) {
		const firstWord = appendFallbackWord(s, pos, out);
		if (firstWord) return result(true, matchedAny, matchedToken, firstWord, pos, 1);
	}
	return stop;
};
862
/**
 * Strip every trailing whitespace placeholder from a signature.
 *
 * The placeholder is the literal text `\s*` in "regex" mode and a single
 * space in any other mode.
 */
const trimTrailingWs = (out, mode) => {
	const suffix = mode === "regex" ? "\\s*" : " ";
	let end = out.length;
	while (end >= suffix.length && out.startsWith(suffix, end - suffix.length)) end -= suffix.length;
	return out.slice(0, end);
};
868
/**
 * Return the leading run of characters up to the first whitespace or
 * punctuation mark (`:`, `،`, `؛`, `.`, `?`, `!`, `؟`), or `null` when the
 * string does not start with such a run.
 */
const extractFirstWord = (s) => {
	const hit = s.match(/^[^\s:،؛.?!؟]+/u);
	return hit === null ? null : hit[0];
};
870
/**
 * Consume a run of spaces/tabs at `pos`.
 *
 * When a run is found, one whitespace placeholder is appended via `appendWs`
 * and `pos` moves past the whole run; otherwise nothing changes.
 */
const skipWhitespace$1 = (s, pos, out, ws) => {
	const run = /^[ \t]+/u.exec(s.slice(pos));
	return run ? {
		out: appendWs(out, ws),
		pos: pos + run[0].length,
		skipped: true
	} : {
		out,
		pos,
		skipped: false
	};
};
884
/**
 * Build a signature for the start of a line.
 *
 * The line is whitespace-collapsed, optionally stripped of diacritics and cut
 * to `opts.prefixChars`. After an optional prefix match, steps are consumed
 * until the step budget (6) or the end of the string is reached, or a step
 * reports it is done.
 *
 * @returns The signature with trailing whitespace placeholders removed, or
 * `null` when the line is empty or nothing matched.
 */
const tokenizeLineStart = (line, tokenNames, opts) => {
	const collapsed = collapseWhitespace(line);
	if (!collapsed) return null;
	const base = opts.normalizeArabicDiacritics ? stripArabicDiacritics(collapsed) : collapsed;
	const s = base.slice(0, opts.prefixChars);
	const compiled = compileTokenRegexes(tokenNames);
	const prefix = appendPrefix(s, 0, "", opts.prefixMatchers, opts.whitespace);
	let pos = prefix.pos;
	let out = prefix.out;
	let matchedAny = prefix.matched;
	let matchedToken = false;
	let steps = 0;
	while (steps < 6 && pos < s.length) {
		const next = consumeLineStartStep(s, pos, out, compiled, opts, matchedAny, matchedToken);
		// A terminal step that matched nothing and changed nothing means no signature.
		const stalled = next.done && !next.matchedAny && !next.matchedToken && next.out === out && next.pos === pos;
		if (stalled) return null;
		matchedAny = next.matchedAny;
		matchedToken = next.matchedToken;
		out = next.out;
		if (next.done) break;
		pos = next.pos;
		steps += next.steps;
	}
	return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
};
912
/**
 * Tokenize one line and record its signature in `acc`.
 *
 * Lines shorter than `opts.minLineLength`, lines rejected by
 * `opts.lineFilter`, and lines without a signature are ignored. Existing
 * entries keep at most `opts.maxExamples` examples.
 */
const processLine = (line, pageId, tokenPriority, opts, acc) => {
	const trimmed = collapseWhitespace(line);
	if (trimmed.length < opts.minLineLength) return;
	if (opts.lineFilter && !opts.lineFilter(trimmed, pageId)) return;
	const sig = tokenizeLineStart(trimmed, tokenPriority, opts);
	if (!sig) return;
	const example = {
		line: trimmed,
		pageId
	};
	const entry = acc.get(sig);
	if (!entry) {
		acc.set(sig, {
			count: 1,
			examples: [example]
		});
		return;
	}
	entry.count++;
	if (entry.examples.length < opts.maxExamples) entry.examples.push(example);
};
934
/** Split a page's content (empty string when missing) into lines and process each one. */
const processPage = (page, tokenPriority, opts, acc) => {
	const lines = normalizeLineEndings(page.content ?? "").split("\n");
	for (const line of lines) processLine(line, page.id, tokenPriority, opts, acc);
};
937
/**
 * Analyze pages and return the most common line-start patterns (top K).
 *
 * Patterns seen fewer than `minCount` times are dropped; the remainder is
 * ordered by count or by specificity depending on `sortBy`.
 */
const analyzeCommonLineStarts = (pages, options = {}) => {
	const opts = resolveOptions$1(options);
	const tokenPriority = buildTokenPriority();
	const acc = /* @__PURE__ */ new Map();
	for (const page of pages) processPage(page, tokenPriority, opts, acc);
	const comparator = opts.sortBy === "count" ? compareByCount : compareBySpecificity;
	const frequent = [];
	for (const [pattern, v] of acc.entries()) {
		if (v.count < opts.minCount) continue;
		frequent.push({
			count: v.count,
			examples: v.examples,
			pattern
		});
	}
	return frequent.sort(comparator).slice(0, opts.topK);
};
952
- //#endregion
953
- //#region src/analysis/repeating-sequences.ts
954
/**
 * Fill in defaults for repeating-sequence analysis options.
 *
 * `minElements`, `minCount` and `topK` are clamped to at least 1, and
 * `maxElements` is never smaller than `minElements`.
 */
const resolveOptions = (options) => {
	// Nullish values (including a nullish `options`) fall back to the default.
	const pick = (key, fallback) => options?.[key] ?? fallback;
	const minElements = Math.max(1, pick("minElements", 1));
	return {
		contextChars: pick("contextChars", 50),
		maxElements: Math.max(minElements, pick("maxElements", 3)),
		maxExamples: pick("maxExamples", 3),
		maxUniquePatterns: pick("maxUniquePatterns", 1e3),
		minCount: Math.max(1, pick("minCount", 3)),
		minElements,
		normalizeArabicDiacritics: pick("normalizeArabicDiacritics", true),
		requireToken: pick("requireToken", true),
		topK: Math.max(1, pick("topK", 20)),
		whitespace: pick("whitespace", "regex")
	};
};
969
/**
 * Create a cursor that maps lengths measured in normalized text back onto the
 * raw text.
 *
 * Without normalization the cursor simply advances by the requested length.
 * With normalization, raw characters that `stripArabicDiacritics` removes
 * entirely do not count toward the requested length, and any such characters
 * directly after the consumed span are swallowed into the same chunk.
 */
const createRawCursor = (text, normalize) => {
	let rawPos = 0;
	const isStripped = (ch) => stripArabicDiacritics(ch).length === 0;
	return {
		/** Advance the cursor and return the raw text chunk that was consumed. */
		advance(normalizedLen) {
			const start = rawPos;
			if (!normalize) {
				rawPos += normalizedLen;
				return text.slice(start, start + normalizedLen);
			}
			for (let counted = 0; counted < normalizedLen && rawPos < text.length; rawPos++) {
				if (!isStripped(text[rawPos])) counted++;
			}
			while (rawPos < text.length && isStripped(text[rawPos])) rawPos++;
			return text.slice(start, rawPos);
		},
		get pos() {
			return rawPos;
		}
	};
};
994
/**
 * Scan text and produce a stream of token and literal items.
 *
 * Matching runs against the (optionally diacritic-stripped) text while a raw
 * cursor tracks the corresponding span in the original text, so each item's
 * `start`/`end` are offsets into `text` and `raw` is `text.slice(start, end)`.
 *
 * @param text - Raw page content.
 * @param normalize - Whether to strip Arabic diacritics before matching.
 * @returns Items of shape `{ start, end, raw, text, type }`, where `type` is
 * "token" or "literal". Whitespace and otherwise unmatched characters produce
 * no item.
 */
const tokenizeContent = (text, normalize) => {
	const normalized = normalize ? stripArabicDiacritics(text) : text;
	const compiled = compileTokenRegexes(buildTokenPriority());
	const cursor = createRawCursor(text, normalize);
	const items = [];
	// Every item derives its span from the raw chunk just consumed. The raw
	// chunk may be longer than the normalized match (trailing marks), so
	// `start` must always be `cursor.pos - raw.length`.
	const pushItem = (raw, itemText, type) => {
		items.push({
			end: cursor.pos,
			raw,
			start: cursor.pos - raw.length,
			text: itemText,
			type
		});
	};
	let pos = 0;
	while (pos < normalized.length) {
		const rest = normalized.slice(pos);
		const ws = /^\s+/u.exec(rest);
		if (ws) {
			pos += ws[0].length;
			cursor.advance(ws[0].length);
			continue;
		}
		const token = findBestTokenMatchAt(normalized, pos, compiled, isArabicLetter);
		if (token) {
			pushItem(cursor.advance(token.text.length), `{{${token.token}}}`, "token");
			pos += token.text.length;
			continue;
		}
		if (isCommonDelimiter(normalized[pos])) {
			// Previously `start` was hard-coded to `cursor.pos - 1`, which is wrong
			// whenever the raw chunk spans more than one character.
			pushItem(cursor.advance(1), escapeSignatureLiteral(normalized[pos]), "literal");
			pos++;
			continue;
		}
		const word = /^[^\s::\-–—ـ،؛.?!؟()[\]{}]+/u.exec(rest);
		if (word) {
			pushItem(cursor.advance(word[0].length), escapeSignatureLiteral(word[0]), "literal");
			pos += word[0].length;
			continue;
		}
		// Unclassifiable character: skip it in both views.
		cursor.advance(1);
		pos++;
	}
	return items;
};
1051
/** Join the window items' text with a space ("space" mode) or the literal `\s*` (any other mode). */
const buildPattern = (window, whitespace) => {
	const separator = whitespace === "space" ? " " : "\\s*";
	return window.map((item) => item.text).join(separator);
};
1053
/** Report whether any item in the window has type "token". */
const hasTokenInWindow = (window) => {
	const isToken = (item) => item.type === "token";
	return window.some(isToken);
};
1055
/**
 * Count the token items in a window and sum the text length of all other
 * items.
 */
const computeWindowStats = (window) => {
	const totals = {
		literalLen: 0,
		tokenCount: 0
	};
	for (const { type, text } of window) {
		if (type === "token") totals.tokenCount += 1;
		else totals.literalLen += text.length;
	}
	return totals;
};
1065
/**
 * Build an example record for a window of items on a page.
 *
 * `text` is the page content covered by the window; `context` extends it by
 * up to `contextChars` on each side and is wrapped in "..." wherever it does
 * not reach the page boundary.
 */
const buildExample = (page, window, contextChars) => {
	const { content } = page;
	const start = window[0].start;
	const end = window.at(-1).end;
	const ctxStart = Math.max(0, start - contextChars);
	const ctxEnd = Math.min(content.length, end + contextChars);
	const leading = ctxStart > 0 ? "..." : "";
	const trailing = ctxEnd < content.length ? "..." : "";
	return {
		context: leading + content.slice(ctxStart, ctxEnd) + trailing,
		pageId: page.id,
		startIndices: window.map((item) => item.start),
		text: content.slice(start, end)
	};
};
1078
/**
 * Record one occurrence of the window's pattern in `stats`.
 *
 * Windows without a token are skipped when `opts.requireToken` is set, and new
 * patterns are not admitted once `stats` holds `opts.maxUniquePatterns`
 * entries. Each entry keeps at most `opts.maxExamples` examples.
 */
const recordPattern = (page, window, opts, stats) => {
	if (opts.requireToken && !hasTokenInWindow(window)) return;
	const pattern = buildPattern(window, opts.whitespace);
	const known = stats.get(pattern);
	if (!known && stats.size >= opts.maxUniquePatterns) return;
	const entry = known || {
		count: 0,
		examples: [],
		...computeWindowStats(window)
	};
	if (!known) stats.set(pattern, entry);
	entry.count++;
	if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
};
1094
/**
 * Record every window of `minElements`..`maxElements` consecutive items on a
 * page, starting at each item index that leaves room for a minimal window.
 */
const extractPageNgrams = (page, items, opts, stats) => {
	for (let i = 0; i <= items.length - opts.minElements; i++) {
		const largest = Math.min(opts.maxElements, items.length - i);
		let size = opts.minElements;
		while (size <= largest) {
			recordPattern(page, items.slice(i, i + size), opts, stats);
			size++;
		}
	}
};
1101
/**
 * Analyze pages for commonly repeating word sequences.
 *
 * Use for continuous text without line breaks. For line-based analysis,
 * use `analyzeCommonLineStarts()` instead.
 *
 * Patterns seen fewer than `minCount` times are dropped; the rest is ranked
 * by count, then token count, then literal length (all descending) and cut to
 * `topK`. Pages with empty content are skipped.
 */
const analyzeRepeatingSequences = (pages, options) => {
	const opts = resolveOptions(options);
	const stats = /* @__PURE__ */ new Map();
	for (const page of pages) {
		if (page.content) extractPageNgrams(page, tokenizeContent(page.content, opts.normalizeArabicDiacritics), opts, stats);
	}
	const byRank = ([, a], [, b]) => b.count - a.count || b.tokenCount - a.tokenCount || b.literalLen - a.literalLen;
	const frequent = [...stats.entries()].filter(([, s]) => s.count >= opts.minCount);
	const top = frequent.sort(byRank).slice(0, opts.topK);
	return top.map(([pattern, s]) => ({
		count: s.count,
		examples: s.examples,
		pattern
	}));
};
1120
- //#endregion
1
+ import { A as ARABIC_MARKS_CLASS, B as getTokenPattern, C as analyzeCommonLineStarts, D as normalizeArabicForComparison, E as makeDiacriticInsensitive, F as containsTokens, H as stripTokenMappings, I as expandCompositeTokensInTemplate, L as expandTokens, M as TOKEN_PATTERNS, N as Token, O as ARABIC_BASE_LETTER_CLASS, P as applyTokenMappings, R as expandTokensWithCaptures, S as analyzeRepeatingSequences, T as escapeTemplateBrackets, U as templateToRegex, V as shouldDefaultToFuzzy, W as withCapture, _ as removeZeroWidth, a as diagnoseDictionaryProfile, b as optimizeRules, c as analyzeDictionaryMarkdownPages, d as getDebugReason, f as getSegmentDebugReason, g as fixTrailingWaw, h as condenseEllipsis, i as escapeWordsOutsideTokens, j as ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, k as ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, l as classifyDictionaryHeading, m as applyPreprocessToPage, n as segmentPages, o as DictionaryProfileValidationError, p as validateSegments, r as createArabicDictionaryEntryRule, s as validateDictionaryProfile, t as suggestSegmentationOptions, u as scanDictionaryMarkdownPage, v as formatValidationReport, w as escapeRegex, x as PATTERN_TYPE_KEYS, y as validateRules, z as getAvailableTokens } from "./segmentation-advisor-D375TL8-.mjs";
1121
2
  //#region src/detection.ts
1122
3
  /**
1123
4
  * Token detection order - more specific patterns first to avoid partial matches.
@@ -1279,3330 +160,6 @@ const analyzeTextForRule = (text) => {
1279
160
  };
1280
161
  };
1281
162
  //#endregion
1282
- //#region src/types/rules.ts
1283
/**
 * Names of the pattern-type properties a split rule can carry.
 *
 * Iterate over this array to enumerate pattern types (for example to build a
 * UI selector), or use it for membership checks on untrusted strings.
 *
 * @example
 * // Build a dropdown/select in UI
 * PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
 *
 * @example
 * // Type-safe pattern key validation
 * const validateKey = (k: string): k is PatternTypeKey =>
 *   (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
 */
const PATTERN_TYPE_KEYS = [
	"lineStartsWith",
	"lineStartsAfter",
	"lineEndsWith",
	"template",
	"regex",
	"dictionaryEntry"
];
1306
- //#endregion
1307
- //#region src/optimization/optimize-rules.ts
1308
/** Pattern keys whose rules `optimizeRules` merges when all other properties are equal. */
const MERGEABLE_KEYS = new Set(["lineStartsWith", "lineStartsAfter", "lineEndsWith"]);
1313
/**
 * Get the pattern type key for a rule: the first entry of
 * `PATTERN_TYPE_KEYS` present on the rule, or "regex" when none is.
 */
const getPatternKey = (rule) => {
	for (const key of PATTERN_TYPE_KEYS) if (key in rule) return key;
	return "regex";
};
1317
/** Return `rule[key]` when it is an array, otherwise a new empty array. */
const getPatternArray = (rule, key) => {
	const value = rule[key];
	if (Array.isArray(value)) return value;
	return [];
};
1321
/**
 * Render `rule[key]` as a string: strings as-is, arrays joined with newlines,
 * other truthy values as JSON, and everything else as the empty string.
 */
const getPatternString = (rule, key) => {
	const value = rule[key];
	if (typeof value === "string") return value;
	if (Array.isArray(value)) return value.join("\n");
	return value ? JSON.stringify(value) : "";
};
1325
/** Deduplicate patterns and order them longest first, breaking ties with `localeCompare`. */
const normalizePatterns = (patterns) => {
	const unique = [...new Set(patterns)];
	unique.sort((a, b) => b.length - a.length || a.localeCompare(b));
	return unique;
};
1326
/**
 * Score how specific a `dictionaryEntry` rule is.
 *
 * Disabled permissive options add points (comma lists 120, parentheses 60,
 * whitespace before colon 20, mid-line subentries 160); `minLetters`
 * contributes 20 each, `maxLetters` 1 each and stop words 1 each up to 25.
 *
 * @param rule - Any split rule.
 * @returns The score, or 0 when the rule has no `dictionaryEntry` key.
 */
const getDictionaryEntrySpecificityScore = (rule) => {
	if (!("dictionaryEntry" in rule)) return 0;
	// `stopWords` defaults to an empty list like every other option, so a
	// hand-authored rule that omits it is scored instead of throwing.
	const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords = [] } = rule.dictionaryEntry;
	return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWords.length, 25);
};
1331
/**
 * Score a rule's specificity for ordering.
 *
 * Dictionary-entry rules use their dedicated score, mergeable rules use the
 * length of their longest pattern, and all other rules use the length of
 * their pattern rendered as a string.
 */
const getSpecificityScore = (rule) => {
	const key = getPatternKey(rule);
	if (key === "dictionaryEntry") return getDictionaryEntrySpecificityScore(rule);
	if (!MERGEABLE_KEYS.has(key)) return getPatternString(rule, key).length;
	let longest = 0;
	for (const pattern of getPatternArray(rule, key)) longest = Math.max(longest, pattern.length);
	return longest;
};
1336
/**
 * Build a grouping key from the rule's pattern type plus the JSON of every
 * other property, so rules differing only in their patterns share a key.
 */
const createMergeKey = (rule) => {
	const key = getPatternKey(rule);
	const rest = { ...rule };
	delete rest[key];
	return `${key}|${JSON.stringify(rest)}`;
};
1341
/**
 * Merge compatible rules and order the result by descending specificity.
 *
 * Rules with a mergeable pattern key and an identical merge key are combined
 * into one rule whose pattern list is deduplicated and normalized; other
 * rules pass through unchanged. Merged rules are copies, so input rules are
 * not modified.
 *
 * @returns `{ mergedCount, rules }`, where `mergedCount` is the number of
 * rules folded into an earlier one.
 */
const optimizeRules = (rules) => {
	const output = [];
	const mergedByKey = /* @__PURE__ */ new Map();
	let mergedCount = 0;
	for (const rule of rules) {
		const key = getPatternKey(rule);
		if (!MERGEABLE_KEYS.has(key)) {
			output.push(rule);
			continue;
		}
		const mergeKey = createMergeKey(rule);
		const target = mergedByKey.get(mergeKey);
		if (target !== void 0) {
			target[key] = normalizePatterns([...getPatternArray(target, key), ...getPatternArray(rule, key)]);
			mergedCount++;
			continue;
		}
		const copy = {
			...rule,
			[key]: normalizePatterns(getPatternArray(rule, key))
		};
		mergedByKey.set(mergeKey, copy);
		output.push(copy);
	}
	output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a));
	return {
		mergedCount,
		rules: output
	};
};
1370
- //#endregion
1371
- //#region src/preprocessing/transforms.ts
1372
/**
 * Exhaustiveness guard for the transform switch: always throws, reporting the
 * unexpected value as JSON.
 */
const assertNever = (x) => {
	const shown = JSON.stringify(x);
	throw new Error(`Unknown preprocess transform type: ${shown}`);
};
1376
/** Report whether the string contains a character matched by the regex class `\s`. */
const isWhitespace = (char) => {
	return /\s/.test(char);
};
1378
/**
 * Check if a character code is a zero-width control character.
 *
 * Accepted code points:
 * - U+200B–U+200F
 * - U+202A–U+202E
 * - U+2060–U+2064
 * - U+FEFF
 */
const isZeroWidth = (code) => {
	if (code >= 0x200b && code <= 0x200f) return true;
	if (code >= 0x202a && code <= 0x202e) return true;
	if (code >= 0x2060 && code <= 0x2064) return true;
	return code === 0xfeff;
};
1388
/**
 * Remove zero-width control characters from text.
 *
 * In "space" mode each run of zero-width characters becomes a single space,
 * except at the start of the text or directly after whitespace, where it is
 * dropped.
 *
 * @param text - Input text
 * @param mode - 'strip' (default) removes entirely, 'space' replaces with space
 * @returns Text with zero-width characters removed or replaced
 */
const removeZeroWidth = (text, mode = "strip") => {
	if (mode !== "space") return text.replace(/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/g, "");
	let result = "";
	let lastWasWhitespace = true;
	for (let i = 0; i < text.length; i++) {
		if (!isZeroWidth(text.charCodeAt(i))) {
			const char = text[i];
			result += char;
			lastWasWhitespace = isWhitespace(char);
			continue;
		}
		if (!lastWasWhitespace && result.length > 0) {
			result += " ";
			lastWasWhitespace = true;
		}
	}
	return result;
};
1413
/**
 * Replace every run of two or more periods with a single ellipsis character (…).
 *
 * Per the original notes, this keeps `{{tarqim}}` from matching the
 * individual periods of an ellipsis.
 *
 * @param text - Input text
 * @returns Text with period sequences replaced by ellipsis
 */
const condenseEllipsis = (text) => {
	return text.replace(/\.{2,}/g, "…");
};
1423
/**
 * Join a standalone و (waw) to the word that follows it.
 *
 * Every ' و ' (space, waw, space) becomes ' و' (space, waw), removing the
 * gap between the waw and the next word.
 *
 * @param text - Input text
 * @returns Text with standalone waw joined to the following word
 */
const fixTrailingWaw = (text) => {
	return text.replace(/ و /g, " و");
};
1432
/**
 * Check if a page ID satisfies a constraint's optional `min` and `max`
 * bounds (both inclusive).
 */
const isInRange = (pageId, constraint) => {
	const belowMin = constraint.min !== void 0 && pageId < constraint.min;
	const aboveMax = constraint.max !== void 0 && pageId > constraint.max;
	return !(belowMin || aboveMax);
};
1440
/**
 * Normalize a transform to its object form: a string becomes
 * `{ type: <string> }`, anything else is returned unchanged.
 */
const normalizeTransform = (transform) => {
	return typeof transform === "string" ? { type: transform } : transform;
};
1447
/**
 * Apply preprocessing transforms to a page's content.
 *
 * Transforms run in array order. Each can be limited to specific pages
 * via `min`/`max` constraints; transforms whose range excludes the page are
 * skipped. An unrecognized transform type throws.
 *
 * @param content - Page content to transform
 * @param pageId - Page ID for constraint checking
 * @param transforms - Array of transforms to apply
 * @returns Transformed content
 */
const applyPreprocessToPage = (content, pageId, transforms) => {
	let result = content;
	for (const transform of transforms) {
		const rule = normalizeTransform(transform);
		if (!isInRange(pageId, rule)) continue;
		const { type } = rule;
		if (type === "removeZeroWidth") result = removeZeroWidth(result, rule.mode ?? "strip");
		else if (type === "condenseEllipsis") result = condenseEllipsis(result);
		else if (type === "fixTrailingWaw") result = fixTrailingWaw(result);
		else assertNever(type);
	}
	return result;
};
1478
- //#endregion
1479
- //#region src/segmentation/arabic-dictionary-rule.ts
1480
/**
 * Keep the first word for each distinct normalized form.
 *
 * Words whose normalized form is empty are dropped; the returned words are
 * the original (un-normalized) strings in input order.
 */
const uniqueCanonicalWords = (words) => {
	const seen = /* @__PURE__ */ new Set();
	const kept = [];
	for (const word of words) {
		const canonical = normalizeArabicForComparison(word);
		if (canonical && !seen.has(canonical)) {
			seen.add(canonical);
			kept.push(word);
		}
	}
	return kept;
};
1491
/**
 * Build a `|`-separated regex alternation of diacritic-insensitive stop
 * words, or the empty string when no usable stop word remains.
 */
const buildStopAlternation = (stopWords) => {
	const alternatives = uniqueCanonicalWords(stopWords).map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word)));
	// Joining an empty list yields "", which is the "no stop words" result.
	return alternatives.join("|");
};
1496
/**
 * Build the regex source for a headword, or a comma-separated headword list.
 *
 * When stop words are present, every unit is guarded by a negative lookahead
 * rejecting a stop word that is immediately followed by the colon pattern
 * (or, in comma-separated mode, by a comma separator).
 */
const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
	const comma = "\\s*[،,]\\s*";
	let element = unit;
	if (stopAlternation) {
		const terminator = allowCommaSeparated ? `(?:${comma}|${colonPattern})` : colonPattern;
		element = `(?!(?:${stopwordBody})${terminator})${unit}`;
	}
	return allowCommaSeparated ? `${element}(?:${comma}${element})*` : element;
};
1501
/**
 * Wrap the headword body in a named capture followed by the colon pattern.
 *
 * With `allowParenthesized`, the headword may alternatively appear inside
 * parentheses; both alternatives use the same capture name.
 * NOTE(review): reusing one group name across alternatives needs a regex
 * engine that supports duplicate named groups — confirm the target runtimes.
 */
const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
	const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
	const captured = `(?<${captureName}>${headwordBody})`;
	const marker = allowParenthesized ? `(?:\\(\\s*${captured}\\s*\\)|${captured})` : captured;
	return `${marker}${colon}`;
};
1507
/**
 * Validate the numeric and naming options of a dictionary-entry rule.
 *
 * @param options - Object with optional `captureName` (default "lemma"),
 * `maxLetters` (default 10) and `minLetters` (default 2).
 * @throws {Error} When `minLetters` is not an integer >= 1, `maxLetters` is
 * not an integer >= `minLetters`, or `captureName` is not a string made of an
 * ASCII letter/underscore followed by word characters.
 */
const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
	if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
	if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
	// Check the type first: a non-string (e.g. null or a number) would otherwise
	// crash on `.match` instead of producing the descriptive validation error.
	if (typeof captureName !== "string" || !/^[A-Za-z_]\w*$/.test(captureName)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
};
1512
/**
 * Compile dictionary-entry options into a regex source string.
 *
 * The pattern anchors at a line start (allowing leading zero-width and
 * direction marks) and, when `midLineSubentries` is enabled, also after
 * whitespace where a waw-prefixed word begins. It then matches the headword
 * (optionally parenthesized or comma-separated) followed by a colon.
 *
 * @param options - Dictionary-entry options; `stopWords` is required.
 * @param capturePrefix - Optional prefix prepended to the capture group name.
 * @returns `{ captureNames, regex }` with the single (prefixed) capture name.
 * @throws {Error} When the options fail validation.
 */
const buildArabicDictionaryEntryRegexSource = (options, capturePrefix) => {
	const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = options;
	validateDictionaryEntryOptions({
		captureName,
		maxLetters,
		minLetters
	});
	const marks = `${ARABIC_MARKS_CLASS}*`;
	const letter = ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN;
	const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
	const wawWithMarks = `و${marks}`;
	const alWithMarks = `ا${marks}ل${marks}`;
	// One letter plus (min-1)..(max-1) more, i.e. minLetters..maxLetters letters.
	const letters = `${letter}(?:${letter}){${minLetters - 1},${maxLetters - 1}}`;
	const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${letters}`;
	const stopAlternation = buildStopAlternation(stopWords);
	const lemmaBody = buildHeadwordBody({
		allowCommaSeparated,
		colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
		stopAlternation,
		stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
		unit: lemmaUnit
	});
	const lineStartBoundary = `(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`;
	const openParen = allowParenthesized ? "(?:\\(\\s*)?" : "";
	const midLineTrigger = `(?<=\\s)(?=${openParen}${wawWithMarks}(?:${alWithMarks})?)`;
	const boundary = midLineSubentries ? `${lineStartBoundary}|${midLineTrigger}` : lineStartBoundary;
	const prefixedCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
	const marker = buildBalancedMarker({
		allowParenthesized,
		allowWhitespaceBeforeColon,
		captureName: prefixedCaptureName,
		headwordBody: lemmaBody
	});
	return {
		captureNames: [prefixedCaptureName],
		regex: `(?:${boundary})${marker}`
	};
};
1544
/**
 * Creates a reusable split rule for Arabic dictionary entries.
 *
 * The returned rule keeps the options as a serializable
 * `{ dictionaryEntry: ... }` pattern (with defaults filled in and stop words
 * deduplicated) instead of compiling them to a regex string here.
 *
 * @throws {Error} When `minLetters`, `maxLetters` or `captureName` is invalid.
 *
 * @example
 * createArabicDictionaryEntryRule({
 *   stopWords: ['وقيل', 'ويقال', 'قال'],
 *   pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
 * })
 *
 * @example
 * createArabicDictionaryEntryRule({
 *   allowParenthesized: true,
 *   allowWhitespaceBeforeColon: true,
 *   allowCommaSeparated: true,
 *   stopWords: ['الليث', 'العجاج'],
 * })
 */
const createArabicDictionaryEntryRule = (options) => {
	const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords } = options;
	validateDictionaryEntryOptions({
		captureName,
		maxLetters,
		minLetters
	});
	const dictionaryEntry = {
		allowCommaSeparated,
		allowParenthesized,
		allowWhitespaceBeforeColon,
		captureName,
		maxLetters,
		midLineSubentries,
		minLetters,
		stopWords: uniqueCanonicalWords(stopWords)
	};
	return {
		dictionaryEntry,
		meta,
		pageStartPrevWordStoplist,
		samePagePrevWordStoplist
	};
};
1587
/** Candidate prefix lengths, longest first. */
const WINDOW_PREFIX_LENGTHS = [80, 60, 40, 30, 20, 15];
/** The same lengths as `WINDOW_PREFIX_LENGTHS`, extended with shorter candidates down to 6. */
const JOINER_PREFIX_LENGTHS = [80, 60, 40, 30, 20, 15, 12, 10, 8, 6];
/** Matches a single whitespace character or a Latin/Arabic punctuation stop character. */
const STOP_CHARACTERS = /[\s\n.,;!?؛،۔۝۞]/;
/**
 * Maximum allowed deviation between expected and actual boundary positions (characters).
 * Matches outside this range are rejected unless `ignoreDeviation` is active.
 */
const MAX_DEVIATION = 2e3;
1613
- //#endregion
1614
- //#region src/segmentation/match-utils.ts
1615
/**
 * Extracts the requested named capture groups from a regex match.
 *
 * Only names listed in `captureNames` whose group value is defined are
 * copied, so unrequested groups never leak into the result.
 *
 * @param groups - The `match.groups` object from `RegExp.exec()`
 * @param captureNames - List of capture names to extract
 * @returns Object with capture name → value pairs, or `undefined` if none found
 *
 * @example
 * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
 * extractNamedCaptures(match.groups, ['num'])
 * // → { num: '٦٦٩٦' }
 *
 * @example
 * // No matching captures, or no groups at all
 * extractNamedCaptures({}, ['num'])        // → undefined
 * extractNamedCaptures(undefined, ['num']) // → undefined
 */
const extractNamedCaptures = (groups, captureNames) => {
	if (!groups || captureNames.length === 0) return;
	const namedCaptures = {};
	for (const name of captureNames) {
		const value = groups[name];
		if (value === void 0) continue;
		namedCaptures[name] = value;
	}
	if (Object.keys(namedCaptures).length === 0) return;
	return namedCaptures;
};
1647
/**
 * Gets the last defined positional capture group from a match array.
 *
 * The array is scanned backward from its final element down to index 1
 * (index 0 is the full match), so the result is unaffected by how many
 * earlier groups exist.
 *
 * @param match - RegExp exec result array
 * @returns The last defined capture group value, or `undefined` if none
 *
 * @example
 * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
 * getLastPositionalCapture(match)
 * // → 'content'
 *
 * @example
 * // No captures
 * getLastPositionalCapture(['full match'])
 * // → undefined
 */
const getLastPositionalCapture = (match) => {
	let i = match.length;
	while (--i >= 1) {
		if (match[i] !== void 0) return match[i];
	}
};
1672
/**
 * Filters matches to only include those within page ID constraints.
 *
 * A match is kept when the page ID at its start offset satisfies the rule's
 * optional `min`/`max` bounds (inclusive) and is not excluded by
 * `rule.exclude`.
 *
 * @param matches - Array of match results to filter
 * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
 * @param getId - Function that returns the page ID for a given offset
 * @returns Filtered array containing only matches within constraints
 *
 * @example
 * const matches = [
 *   { start: 0, end: 10 },     // Page 1
 *   { start: 100, end: 110 },  // Page 5
 *   { start: 200, end: 210 },  // Page 10
 * ];
 * filterByConstraints(matches, { min: 3, max: 8 }, getId)
 * // → [{ start: 100, end: 110 }] (only page 5 match)
 */
const filterByConstraints = (matches, rule, getId) => {
	const isAllowed = (m) => {
		const id = getId(m.start);
		const withinBounds = (rule.min === void 0 || id >= rule.min) && (rule.max === void 0 || id <= rule.max);
		return withinBounds && !isPageExcluded(id, rule.exclude);
	};
	return matches.filter((m) => isAllowed(m));
};
1696
/**
 * Checks if any rule in the list allows the given page ID.
 *
 * A rule allows an ID when the ID lies within the rule's optional `min`/`max`
 * bounds (inclusive); a rule without bounds allows every ID. Only `min` and
 * `max` are consulted here.
 *
 * @param rules - Array of rules with optional `min` and `max` constraints
 * @param pageId - Page ID to check
 * @returns `true` if at least one rule allows the page ID
 *
 * @example
 * const rules = [
 *   { min: 5, max: 10 },  // Allows pages 5-10
 *   { min: 20 },          // Allows pages 20+
 * ];
 *
 * anyRuleAllowsId(rules, 7)   // → true
 * anyRuleAllowsId(rules, 3)   // → false
 * anyRuleAllowsId(rules, 25)  // → true
 * anyRuleAllowsId([{}], 999)  // → true
 */
const anyRuleAllowsId = (rules, pageId) => {
	const allows = (rule) => {
		const meetsMin = rule.min === void 0 || pageId >= rule.min;
		return meetsMin && (rule.max === void 0 || pageId <= rule.max);
	};
	return rules.some((rule) => allows(rule));
};
1724
/**
 * Find the first defined group whose name is `prefix` followed by a number
 * and return that number.
 *
 * @returns The parsed index, or `undefined` when there are no groups or no
 * defined group name carries a numeric suffix after `prefix`.
 */
const extractDebugIndex = (groups, prefix) => {
	if (!groups) return;
	for (const key in groups) {
		if (!key.startsWith(prefix) || groups[key] === void 0) continue;
		const idx = Number.parseInt(key.slice(prefix.length), 10);
		if (!Number.isNaN(idx)) return idx;
	}
};
1731
- //#endregion
1732
- //#region src/segmentation/breakpoint-utils.ts
1733
/**
 * Escapes regex metacharacters outside of `{{token}}` delimiters.
 *
 * The word is split into `{{...}}` token parts, which are kept verbatim, and
 * the text between them, in which `. * + ? ^ $ { } | \` are backslash-escaped.
 *
 * Note: `()[]` are NOT escaped here; per the original notes a later stage
 * (`processPattern` via `escapeTemplateBrackets`) handles them, which avoids
 * double-escaping.
 *
 * @param word - Word string that may contain {{tokens}}
 * @returns String with metacharacters escaped outside tokens
 *
 * @example
 * escapeWordsOutsideTokens('a.*b')
 * // → 'a\\.\\*b'
 *
 * escapeWordsOutsideTokens('{{naql}}.test')
 * // → '{{naql}}\\.test'
 *
 * escapeWordsOutsideTokens('(literal)')
 * // → '(literal)'
 */
const escapeWordsOutsideTokens = (word) => {
	const isToken = (part) => part.startsWith("{{") && part.endsWith("}}");
	let escaped = "";
	for (const part of word.split(/(\{\{[^}]+\}\})/g)) {
		escaped += isToken(part) ? part : part.replace(/[.*+?^${}|\\]/g, "\\$&");
	}
	return escaped;
};
1756
/**
 * Brings a breakpoint into its object form with a guaranteed `split` mode.
 *
 * - A string becomes `{ pattern: str, split: 'after' }`.
 * - An object keeps all of its fields; only `split` is resolved.
 * - `split` is kept when it is `'at'` or `'after'`; any other value falls back
 *   to the default, which is `'at'` for `words` rules and `'after'` otherwise.
 * - `pattern`/`regex` precedence is not decided here; both fields are passed
 *   through untouched.
 *
 * @param bp - Breakpoint as string or object
 * @returns Normalized rule object
 * @throws Error when `words` is combined with `pattern` or `regex`
 *
 * @example
 * normalizeBreakpoint('\\n\\n')
 * // → { pattern: '\\n\\n', split: 'after' }
 *
 * normalizeBreakpoint({ pattern: 'X', split: 'at' })
 * // → { pattern: 'X', split: 'at' }
 *
 * normalizeBreakpoint({ words: ['فهذا', 'ثم'] })
 * // → { words: ['فهذا', 'ثم'], split: 'at' }
 */
const normalizeBreakpoint = (bp) => {
	if (typeof bp === "string") {
		return { pattern: bp, split: "after" };
	}
	if (bp.words) {
		const hasPatternOrRegex = bp.pattern !== undefined || bp.regex !== undefined;
		if (hasPatternOrRegex) {
			throw new Error("BreakpointRule: \"words\" cannot be combined with \"pattern\" or \"regex\"");
		}
	}
	let split = bp.words ? "at" : "after";
	if (bp.split === "at" || bp.split === "after") {
		split = bp.split;
	}
	return { ...bp, split };
};
1795
/**
 * Tells whether a page ID appears in an exclude list.
 *
 * Each list entry is either a single page ID (a number) or a `[from, to]`
 * pair treated as an inclusive range.
 *
 * @param pageId - Page ID to check
 * @param excludeList - Page IDs and/or inclusive ranges; may be null/undefined
 * @returns True if some entry covers `pageId`; false for a missing list
 *
 * @example
 * isPageExcluded(5, [1, 5, 10]) // → true
 * isPageExcluded(5, [[3, 7]])   // → true
 * isPageExcluded(5, [[10, 20]]) // → false
 */
const isPageExcluded = (pageId, excludeList) => {
	if (excludeList == null) return false;
	return excludeList.some((entry) => {
		if (typeof entry === "number") return pageId === entry;
		return pageId >= entry[0] && pageId <= entry[1];
	});
};
1813
/**
 * Decides whether a breakpoint rule applies to a page.
 *
 * The page must be at or above `rule.min` (when set), at or below `rule.max`
 * (when set), and must not be covered by `rule.exclude`.
 *
 * @param pageId - Page ID to check
 * @param rule - Breakpoint rule with optional min/max/exclude constraints
 * @returns True if the page satisfies every constraint
 *
 * @example
 * isInBreakpointRange(50, { pattern: '\\n', min: 10, max: 100 }) // → true
 * isInBreakpointRange(5, { pattern: '\\n', min: 10 })            // → false (below min)
 */
const isInBreakpointRange = (pageId, rule) => {
	const { min, max, exclude } = rule;
	const aboveMin = min === undefined || pageId >= min;
	if (!aboveMin) return false;
	const belowMax = max === undefined || pageId <= max;
	if (!belowMax) return false;
	return !isPageExcluded(pageId, exclude);
};
1831
/**
 * Flattens an exclude list into a Set of individual page IDs.
 *
 * Number entries are added as-is; `[from, to]` entries are expanded to every
 * integer step from `from` through `to` inclusive. A pair whose `from` is
 * greater than `to` contributes nothing.
 *
 * @param excludeList - Page IDs and/or inclusive ranges; falsy means "none"
 * @returns Set holding every excluded page ID
 *
 * @remarks
 * Ranges are materialised one ID at a time, so memory grows with the width of
 * the ranges. That is fine for book-sized inputs; very wide ranges would be
 * better served by a range-based membership check.
 *
 * @example
 * buildExcludeSet([1, 5, [10, 12]])
 * // → Set { 1, 5, 10, 11, 12 }
 */
const buildExcludeSet = (excludeList) => {
	const excluded = new Set();
	if (!excludeList) return excluded;
	for (const entry of excludeList) {
		if (typeof entry === "number") {
			excluded.add(entry);
			continue;
		}
		let pageId = entry[0];
		while (pageId <= entry[1]) {
			excluded.add(pageId);
			pageId++;
		}
	}
	return excluded;
};
1853
/**
 * Assembles a segment object from trimmed content and page information.
 *
 * `to` is only included when it is defined and differs from `from`; `meta` is
 * only included when it is truthy.
 *
 * @param content - Segment content (trimmed before use)
 * @param fromPageId - Starting page ID
 * @param toPageId - Optional ending page ID
 * @param meta - Optional metadata to attach
 * @returns The segment, or null when the content is empty after trimming
 *
 * @example
 * createSegment('Hello world', 1, 3, { chapter: 1 })
 * // → { content: 'Hello world', from: 1, to: 3, meta: { chapter: 1 } }
 *
 * createSegment(' ', 1, undefined, undefined)
 * // → null (empty content)
 */
const createSegment = (content, fromPageId, toPageId, meta) => {
	const text = content.trim();
	if (text === "") return null;
	const segment = { content: text, from: fromPageId };
	const spansPages = toPageId !== undefined && toPageId !== fromPageId;
	if (spansPages) segment.to = toPageId;
	if (meta) segment.meta = meta;
	return segment;
};
1880
/**
 * Expands breakpoint patterns and pre-computes exclude sets (documents `expandBreakpoints`).
 *
 * @param breakpoints - Array of breakpoint patterns or rules
 * @param processPattern - Function to expand tokens in patterns (with bracket escaping)
 * @param processRawPattern - Function to expand tokens without bracket escaping (for regex field)
 * @returns Array of expanded breakpoints with compiled regexes
 *
 * @remarks
 * This function compiles regex patterns dynamically. This can be a ReDoS vector
 * if patterns come from untrusted sources. In typical usage, breakpoint rules
 * are application configuration, not user input.
 */
1895
/**
 * Builds the regex source for a `words` breakpoint.
 *
 * Each word has its leading whitespace removed and empty results are dropped.
 * The survivors are escaped with `escapeWordsOutsideTokens` and expanded with
 * `processPattern`. When several words expand to the same pattern only the
 * first is kept. Alternatives are ordered longest expanded pattern first, and
 * each is wrapped in a named group `_w<index>` where `<index>` is the word's
 * position in the input array.
 *
 * @param words - Words that should trigger a break
 * @param processPattern - Function to expand tokens in patterns
 * @returns Regex source of the form `\s+(?:...)`, or null when no usable word remains
 */
const buildWordsRegex = (words, processPattern) => {
	const trimmedWords = words.map((word) => word.trimStart());
	// Map preserves insertion order, so it doubles as "first occurrence wins".
	const firstIndexByPattern = new Map();
	trimmedWords.forEach((word, originalIndex) => {
		if (word.length === 0) return;
		const pattern = processPattern(escapeWordsOutsideTokens(word));
		if (!firstIndexByPattern.has(pattern)) firstIndexByPattern.set(pattern, originalIndex);
	});
	if (firstIndexByPattern.size === 0) return null;
	const alternatives = [...firstIndexByPattern]
		.sort(([patternA], [patternB]) => patternB.length - patternA.length)
		.map(([pattern, originalIndex]) => `(?<_w${originalIndex}>${pattern})`);
	return `\\s+(?:${alternatives.join("|")})`;
};
1917
/**
 * Compiles a rule's optional `skipWhen` pattern.
 *
 * @param rule - Rule that may carry a `skipWhen` pattern
 * @param processPattern - Function applied to the pattern before compiling
 * @returns RegExp with flags `mu`, or null when `skipWhen` is undefined
 * @throws Error naming the original `skipWhen` value when compilation fails
 */
const compileSkipWhenRegex = (rule, processPattern) => {
	const { skipWhen } = rule;
	if (skipWhen === undefined) return null;
	const source = processPattern(skipWhen);
	let compiled;
	try {
		compiled = new RegExp(source, "mu");
	} catch (error) {
		const reason = error instanceof Error ? error.message : String(error);
		throw new Error(`Invalid breakpoint skipWhen regex: ${skipWhen}\n Cause: ${reason}`);
	}
	return compiled;
};
1928
/**
 * Compiles an already-expanded breakpoint pattern.
 *
 * @param pattern - Regex source to compile
 * @param fieldName - Label of the originating rule field, used in the error text
 * @returns RegExp with flags `gmu`
 * @throws Error naming the field and pattern when compilation fails
 */
const compilePatternRegex = (pattern, fieldName) => {
	let compiled;
	try {
		compiled = new RegExp(pattern, "gmu");
	} catch (error) {
		const reason = error instanceof Error ? error.message : String(error);
		throw new Error(`Invalid breakpoint ${fieldName}: ${pattern}\n Cause: ${reason}`);
	}
	return compiled;
};
1937
/**
 * Expands one breakpoint into the compiled form used during splitting.
 *
 * The result bundles the normalized rule with its exclude set, its compiled
 * `skipWhen` regex, its compiled break regex and a `splitAt` flag:
 * - `words` rules compile the alternation built by `buildWordsRegex`; a rule
 *   whose words all turn out unusable expands to null.
 * - A rule whose `regex ?? pattern` is empty or undefined gets `regex: null`
 *   and `splitAt: false`.
 * - Otherwise `regex` (when defined) is expanded with `processRawPattern` if
 *   that function was supplied, and in every other case `processPattern` is used.
 *
 * @param bp - Breakpoint as string or rule object
 * @param processPattern - Function to expand tokens in patterns
 * @param processRawPattern - Optional expander used for the `regex` field
 * @returns Expanded breakpoint, or null for a `words` rule with no usable words
 */
const expandSingleBreakpoint = (bp, processPattern, processRawPattern) => {
	const rule = normalizeBreakpoint(bp);
	const excludeSet = buildExcludeSet(rule.exclude);
	const skipWhenRegex = compileSkipWhenRegex(rule, processPattern);
	// Every expanded form shares these fields; only regex/splitAt vary.
	const expanded = (regex, splitAt) => ({ excludeSet, regex, rule, skipWhenRegex, splitAt });

	if (rule.words !== undefined) {
		const wordsSource = buildWordsRegex(rule.words, processPattern);
		if (wordsSource === null) return null;
		const wordsRegex = compilePatternRegex(wordsSource, `words: ${rule.words.join(", ")}`);
		return expanded(wordsRegex, rule.split === "at");
	}

	const source = rule.regex ?? rule.pattern;
	if (source === "" || source === undefined) return expanded(null, false);

	const fromRegexField = rule.regex !== undefined;
	const expandedSource = fromRegexField && processRawPattern ? processRawPattern(source) : processPattern(source);
	const compiled = compilePatternRegex(expandedSource, fromRegexField ? "regex" : "pattern");
	return expanded(compiled, rule.split === "at");
};
1969
/**
 * Expands every breakpoint with `expandSingleBreakpoint`, dropping the ones
 * that expand to null.
 *
 * @param breakpoints - Array of breakpoint patterns or rules
 * @param processPattern - Function to expand tokens in patterns
 * @param processRawPattern - Optional expander used for the `regex` field
 * @returns Expanded breakpoints in their original order
 */
const expandBreakpoints = (breakpoints, processPattern, processRawPattern) =>
	breakpoints.reduce((expandedList, bp) => {
		const expanded = expandSingleBreakpoint(bp, processPattern, processRawPattern);
		if (expanded !== null) expandedList.push(expanded);
		return expandedList;
	}, []);
1970
/**
 * Replaces the newline that sits directly before each detected page start
 * with a single space, for chunks that span several pages.
 *
 * Nothing is changed when `joiner` is `"newline"`, when the chunk covers a
 * single page (`fromIdx >= toIdx`), or when the content has no newline at all.
 *
 * Page starts are located one after another: each page's prefix is searched
 * from the previously found page start onward, so in-page newlines that do not
 * directly precede a page start are left alone.
 *
 * @param content - Chunk content spanning pages `fromIdx..toIdx`
 * @param fromIdx - Index of the first page of the chunk
 * @param toIdx - Index of the last page of the chunk
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @param joiner - Configured joiner; `"newline"` disables the replacement
 * @returns Content with page-boundary newlines replaced by spaces
 */
const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalizedPages, joiner) => {
	const nothingToJoin = joiner === "newline" || fromIdx >= toIdx || !content.includes("\n");
	if (nothingToJoin) return content;

	let result = content;
	let cursor = 0;
	for (let pageIdx = fromIdx + 1; pageIdx <= toIdx; pageIdx += 1) {
		const page = normalizedPages.get(pageIds[pageIdx]);
		if (!page) continue;
		const pageStart = findPrefixPositionInContent(result, page.content.trimStart(), cursor);
		if (pageStart <= 0) continue;
		// One character is swapped for one character, so later offsets stay valid.
		if (result[pageStart - 1] === "\n") {
			result = `${result.slice(0, pageStart - 1)} ${result.slice(pageStart)}`;
		}
		cursor = pageStart;
	}
	return result;
};
1993
/**
 * Locates where a page begins inside `content` by searching for a prefix of
 * the page text, trying each length in `JOINER_PREFIX_LENGTHS` in order.
 *
 * @param content - Text to search in
 * @param trimmedPageContent - Page text with leading whitespace already removed
 * @param searchFrom - Index at which the search starts
 * @returns First position greater than 0 at which a prefix is found, else -1
 *          (a hit at index 0 is not accepted)
 */
const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
	for (const prefixLength of JOINER_PREFIX_LENGTHS) {
		const cut = Math.min(prefixLength, trimmedPageContent.length);
		const needle = trimmedPageContent.slice(0, cut).trim();
		if (needle) {
			const hit = content.indexOf(needle, searchFrom);
			if (hit > 0) return hit;
		}
	}
	return -1;
};
2005
/**
 * Estimates how far into the current page `remainingContent` begins.
 *
 * After an earlier split, `remainingContent` may start in the middle of a
 * page. This helper takes the start of `remainingContent` (first 500
 * characters, leading whitespace removed) and looks for the FIRST occurrence
 * of it inside the current page's content. Needles are tried from
 * `min(30, available)` characters downward in steps of 5 (never shorter than
 * 5), followed by a final 3-character attempt.
 *
 * @param remainingContent - Content still to be split
 * @param currentFromIdx - Index of the page `remainingContent` starts in
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @returns Offset of the first matching needle within the page, or 0 when the
 *          page is unknown, the content is blank, or nothing matches
 */
const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
	const page = normalizedPages.get(pageIds[currentFromIdx]);
	if (!page) return 0;
	const head = remainingContent.slice(0, 500).trimStart();
	if (!head) return 0;

	// Needle lengths in the order they are probed.
	const needleLengths = [];
	for (let len = Math.min(30, head.length); len >= 5; len -= 5) needleLengths.push(len);
	if (head.length >= 3) needleLengths.push(3);

	for (const len of needleLengths) {
		const offset = page.content.indexOf(head.slice(0, len));
		if (offset >= 0) return offset;
	}
	return 0;
};
2031
/**
 * Estimates how far into the current page `remainingContent` begins, using
 * the LAST occurrence of its opening text inside the page.
 *
 * The start of `remainingContent` (first 500 characters, leading whitespace
 * removed) is cut into needles of `min(30, available)` characters downward in
 * steps of 5 (never shorter than 5), followed by a final 3-character attempt.
 * The first needle found in the page decides the result.
 *
 * @param remainingContent - Content still to be split
 * @param currentFromIdx - Index of the page `remainingContent` starts in
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @returns Offset of the last occurrence of the matching needle, or 0 when the
 *          page is unknown, the content is blank, or nothing matches
 */
const estimateStartOffsetInCurrentPageFromEnd = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
	const page = normalizedPages.get(pageIds[currentFromIdx]);
	if (!page) return 0;
	const head = remainingContent.slice(0, 500).trimStart();
	if (!head) return 0;

	const lastOffsetOf = (needleLength) => page.content.lastIndexOf(head.slice(0, needleLength));

	let needleLength = Math.min(30, head.length);
	while (needleLength >= 5) {
		const offset = lastOffsetOf(needleLength);
		if (offset >= 0) return offset;
		needleLength -= 5;
	}
	if (head.length >= 3) {
		const offset = lastOffsetOf(3);
		if (offset >= 0) return offset;
	}
	return 0;
};
2049
/**
 * Chooses between the first-occurrence and last-occurrence estimates of where
 * the segment starts inside its first page.
 *
 * When the two estimates agree, when the segment has no following page, or
 * when cumulative offsets are missing, the first-occurrence estimate is
 * returned. Otherwise each estimate is scored by how close the next page's
 * detected start lands to the boundary that estimate predicts, and the
 * estimate with the smallest distance wins (ties keep the earlier candidate).
 *
 * @param segmentContent - Segment content being analysed
 * @param fromIdx - Index of the segment's first page
 * @param toIdx - Index of the segment's last page
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @param cumulativeOffsets - Cumulative character offsets per page index
 * @param logger - Optional logger forwarded to the boundary search
 * @returns Selected start offset within the first page
 */
const selectStartOffsetInCurrentPage = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
	const fromFirstMatch = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
	const fromLastMatch = estimateStartOffsetInCurrentPageFromEnd(segmentContent, fromIdx, pageIds, normalizedPages);
	const candidates = Array.from(new Set([fromFirstMatch, fromLastMatch]));
	const fallback = candidates[0] ?? 0;
	if (candidates.length <= 1 || fromIdx + 1 > toIdx) return fallback;

	const nextPageOffset = cumulativeOffsets[fromIdx + 1];
	const firstPageOffset = cumulativeOffsets[fromIdx];
	if (nextPageOffset === undefined || firstPageOffset === undefined) return fallback;
	const rawBoundary = Math.max(0, nextPageOffset - firstPageOffset);

	let winner = fallback;
	let smallestGap = Number.POSITIVE_INFINITY;
	for (const candidate of candidates) {
		const expectedBoundary = Math.max(0, rawBoundary - candidate);
		const found = findPageStartNearExpectedBoundary(segmentContent, fromIdx + 1, expectedBoundary, pageIds, normalizedPages, logger);
		if (found <= 0) continue;
		const gap = Math.abs(found - expectedBoundary);
		if (gap < smallestGap) {
			smallestGap = gap;
			winner = candidate;
		}
	}
	return winner;
};
2071
/**
 * Searches `remainingContent` for the start of a target page, preferring
 * matches close to where the page is expected to begin.
 *
 * Prefixes of the page (lengths from `WINDOW_PREFIX_LENGTHS`, tried in order)
 * are looked up with `findAnchorCandidates`. Normally only the window from
 * 10,000 characters before to 2,000 characters after the expected boundary is
 * scanned, and matches further than `MAX_DEVIATION` from it are rejected
 * (and reported through `logger.debug`). When the expected boundary lies at or
 * beyond the end of the content it carries no positional information: the
 * whole content is scanned, any distance is accepted and candidates are ranked
 * by distance from position 0.
 *
 * @param remainingContent - Content to search in
 * @param targetPageIdx - Index of the page whose start is wanted
 * @param expectedBoundary - Position where the page is expected to start
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @param logger - Optional logger for rejected matches
 * @returns Position of the best accepted match, or -1 when none qualifies
 */
const findPageStartNearExpectedBoundary = (remainingContent, targetPageIdx, expectedBoundary, pageIds, normalizedPages, logger) => {
	const targetPage = normalizedPages.get(pageIds[targetPageIdx]);
	if (!targetPage) return -1;

	const contentLength = remainingContent.length;
	const clampedExpected = Math.min(Math.max(0, expectedBoundary), contentLength);
	const pageHead = targetPage.content.trimStart();
	const unanchored = expectedBoundary >= contentLength;

	let scanStart = Math.max(0, clampedExpected - 1e4);
	let scanEnd = Math.min(contentLength, clampedExpected + 2e3);
	let rankingTarget = expectedBoundary;
	if (unanchored) {
		scanStart = 0;
		scanEnd = contentLength;
		rankingTarget = 0;
	}

	for (const prefixLength of WINDOW_PREFIX_LENGTHS) {
		const prefix = pageHead.slice(0, Math.min(prefixLength, pageHead.length)).trim();
		if (!prefix) continue;
		const matches = findAnchorCandidates(remainingContent, prefix, scanStart, scanEnd);
		if (matches.length === 0) continue;

		const deviationLimit = unanchored ? Number.POSITIVE_INFINITY : MAX_DEVIATION;
		const acceptable = matches.filter((match) => Math.abs(match.pos - expectedBoundary) <= deviationLimit);
		if (acceptable.length > 0) return selectBestAnchor(acceptable, rankingTarget).pos;

		// Every match is too far away: report the closest one, then try the next prefix length.
		const closest = selectBestAnchor(matches, rankingTarget);
		logger?.debug?.("[breakpoints] findPageStartNearExpectedBoundary: Rejected match exceeding deviation", {
			bestDistance: Math.abs(closest.pos - rankingTarget),
			expectedBoundary,
			matchPos: closest.pos,
			maxDeviation: deviationLimit,
			prefixLength,
			targetPageIdx
		});
	}
	return -1;
};
2109
/**
 * Finds all whitespace-preceded occurrences of a prefix within a search range.
 *
 * An occurrence qualifies only when the character directly before it is
 * whitespace; occurrences at index 0 therefore never qualify. Each candidate
 * records whether that preceding character is a newline.
 *
 * @param content - Text to search in
 * @param prefix - Text to look for
 * @param start - Index at which the search starts
 * @param end - Last index (inclusive) at which an occurrence may start
 * @returns Candidates `{ isNewline, pos }` in ascending position order
 */
const findAnchorCandidates = (content, prefix, start, end) => {
	const candidates = [];
	let pos = content.indexOf(prefix, start);
	while (pos !== -1 && pos <= end) {
		if (pos > 0) {
			const charBefore = content[pos - 1];
			if (charBefore === "\n") candidates.push({
				isNewline: true,
				pos
			});
			else if (/\s/.test(charBefore)) candidates.push({
				isNewline: false,
				pos
			});
		}
		const next = content.indexOf(prefix, pos + 1);
		// `indexOf` clamps its start index to `content.length`, so an empty prefix
		// keeps yielding the same index once the end of the content is reached.
		// Stopping when the search fails to advance prevents an endless loop.
		if (next <= pos) break;
		pos = next;
	}
	return candidates;
};
2129
/**
 * Picks the anchor candidate with the lowest penalty.
 *
 * The penalty is the distance from `expectedBoundary`, plus 20 for candidates
 * that are not preceded by a newline. On equal penalties the earlier candidate
 * in the array is kept.
 *
 * @param candidates - Non-empty array of `{ isNewline, pos }` candidates
 * @param expectedBoundary - Position the candidates are ranked against
 * @returns The winning candidate (throws TypeError on an empty array)
 */
const selectBestAnchor = (candidates, expectedBoundary) => {
	const penaltyOf = ({ pos, isNewline }) => Math.abs(pos - expectedBoundary) + (isNewline ? 0 : 20);
	return candidates.reduce((leader, challenger) => {
		const leaderPenalty = penaltyOf(leader);
		return penaltyOf(challenger) < leaderPenalty ? challenger : leader;
	});
};
2136
/**
 * Finds the start of a target page strictly after `minPos`.
 *
 * Used when content repeats, so that an earlier duplicate of the page's
 * opening text is not mistaken for the page start. Prefix lengths from
 * `WINDOW_PREFIX_LENGTHS` are tried in order; the first length producing a
 * candidate past `minPos` decides, ranked by `selectBestAnchor` against `minPos`.
 *
 * @param remainingContent - Content to search in
 * @param targetPageIdx - Index of the page whose start is wanted
 * @param minPos - Position the result must exceed
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @returns Position of the selected candidate, or -1 when none is found
 */
const findPageStartAfterPosition = (remainingContent, targetPageIdx, minPos, pageIds, normalizedPages) => {
	const targetPage = normalizedPages.get(pageIds[targetPageIdx]);
	if (!targetPage) return -1;
	const pageHead = targetPage.content.trimStart();
	for (const prefixLength of WINDOW_PREFIX_LENGTHS) {
		const prefix = pageHead.slice(0, Math.min(prefixLength, pageHead.length)).trim();
		if (!prefix) continue;
		const found = findAnchorCandidates(remainingContent, prefix, Math.max(0, minPos), remainingContent.length);
		const later = found.filter(({ pos }) => pos > minPos);
		if (later.length === 0) continue;
		return selectBestAnchor(later, minPos).pos;
	}
	return -1;
};
2152
/**
 * Derives page boundary positions directly from cumulative offsets, without
 * searching the content.
 *
 * Each page after `fromIdx` gets the offset distance from page `fromIdx`,
 * capped at the content length and forced to be at least one past the
 * previous boundary. Pages without a cumulative offset are skipped. The
 * content length is appended as the final sentinel.
 *
 * @param segmentContent - Segment content (only its length is used)
 * @param fromIdx - Index of the segment's first page
 * @param toIdx - Index of the segment's last page
 * @param pageCount - Number of pages in the segment (logged only)
 * @param cumulativeOffsets - Cumulative character offsets per page index
 * @param logger - Optional logger for debugging
 * @returns Boundary positions starting with 0 and ending with the sentinel
 */
const buildBoundaryPositionsFastPath = (segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger) => {
	logger?.debug?.("[breakpoints] Using fast-path for large segment in buildBoundaryPositions", {
		fromIdx,
		pageCount,
		toIdx
	});
	const contentLength = segmentContent.length;
	const origin = cumulativeOffsets[fromIdx] ?? 0;
	const positions = [0];
	let previous = 0;
	for (let pageIdx = fromIdx + 1; pageIdx <= toIdx; pageIdx += 1) {
		const pageOffset = cumulativeOffsets[pageIdx];
		if (pageOffset === undefined) continue;
		const relative = Math.max(0, pageOffset - origin);
		const capped = Math.min(relative, contentLength);
		previous = Math.max(previous + 1, capped);
		positions.push(previous);
	}
	positions.push(contentLength);
	return positions;
};
2171
/**
 * Checks whether a detected page-start position may be used as a boundary.
 *
 * A position is rejected when it is not positive or does not lie past the
 * previous boundary. Beyond that it is accepted outright when deviation is
 * ignored or when the expected boundary lies at or past the end of the
 * segment; otherwise it must be closer than `MAX_DEVIATION` to the expected
 * boundary.
 *
 * @param pos - Detected position
 * @param prevBoundary - Previously accepted boundary position
 * @param expectedBoundary - Position where the boundary is expected
 * @param segmentLength - Length of the segment content
 * @param ignoreDeviation - Skip the distance check (defaults to false)
 * @returns True if the position is usable
 */
const isBoundaryPositionValid = (pos, prevBoundary, expectedBoundary, segmentLength, ignoreDeviation = false) => {
	const makesNoProgress = pos <= 0 || pos <= prevBoundary;
	if (makesNoProgress) return false;
	if (ignoreDeviation || expectedBoundary >= segmentLength) return true;
	return Math.abs(pos - expectedBoundary) < MAX_DEVIATION;
};
2178
/**
 * Locates a page start for one boundary and, if permitted, re-derives the
 * start offset of the segment's first page from what was found.
 *
 * The page start is first searched near `rawBoundary - startOffsetInFromPage`
 * (or near the end of the content when `rawBoundary` is undefined). If that
 * fails and inference is allowed, an unconstrained search is made; its hit is
 * adopted only when the implied start offset is non-negative and the hit is
 * less than 500 characters from the original expectation.
 *
 * @param segmentContent - Segment content to search in
 * @param pageIdx - Index of the page whose start is wanted
 * @param rawBoundary - Offset distance from the segment's first page, if known
 * @param startOffsetInFromPage - Current estimate of the segment's start offset
 * @param canInferStartOffset - Whether the start offset may be re-derived
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @param logger - Optional logger forwarded to the boundary search
 * @returns `{ didInferStartOffset, expectedBoundary, pos, startOffsetInFromPage }`
 */
const resolveBoundaryMatch = (segmentContent, pageIdx, rawBoundary, startOffsetInFromPage, canInferStartOffset, pageIds, normalizedPages, logger) => {
	const locateNear = (expected) => findPageStartNearExpectedBoundary(segmentContent, pageIdx, expected, pageIds, normalizedPages, logger);
	const hasRawBoundary = rawBoundary !== undefined;

	const expectedBoundary = hasRawBoundary ? Math.max(0, rawBoundary - startOffsetInFromPage) : segmentContent.length;
	const direct = {
		didInferStartOffset: false,
		expectedBoundary,
		pos: locateNear(expectedBoundary),
		startOffsetInFromPage
	};
	const mayInfer = direct.pos < 0 && canInferStartOffset && hasRawBoundary;
	if (!mayInfer) return direct;

	const relaxedPos = locateNear(segmentContent.length);
	if (!(relaxedPos > 0)) return direct;

	const inferredStartOffset = rawBoundary - relaxedPos;
	const isPlausible = inferredStartOffset >= 0 && Math.abs(relaxedPos - expectedBoundary) < 500;
	if (!isPlausible) return direct;

	return {
		didInferStartOffset: true,
		expectedBoundary: Math.max(0, rawBoundary - inferredStartOffset),
		pos: relaxedPos,
		startOffsetInFromPage: inferredStartOffset
	};
};
2202
/**
 * Derives page boundary positions by searching the segment content for the
 * start of every page after the first.
 *
 * For each page the start is resolved with `resolveBoundaryMatch`; the start
 * offset of the first page may be re-inferred once, and only while handling
 * the first boundary. A hit that does not lie past the previous boundary is
 * retried with `findPageStartAfterPosition`. Hits rejected by
 * `isBoundaryPositionValid` are replaced by an estimate: the expected boundary,
 * at least one past the previous boundary and at most the content length.
 * The content length is appended as the final sentinel.
 *
 * @param segmentContent - Segment content to search in
 * @param fromIdx - Index of the segment's first page
 * @param toIdx - Index of the segment's last page
 * @param pageCount - Number of pages in the segment (logged only)
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @param cumulativeOffsets - Cumulative character offsets per page index
 * @param logger - Optional logger for debugging
 * @returns Boundary positions starting with 0 and ending with the sentinel
 */
const buildBoundaryPositionsAccurate = (segmentContent, fromIdx, toIdx, pageCount, pageIds, normalizedPages, cumulativeOffsets, logger) => {
	const contentLength = segmentContent.length;
	const positions = [0];
	logger?.debug?.("[breakpoints] buildBoundaryPositions: Using accurate string-search path", {
		contentLength,
		fromIdx,
		pageCount,
		toIdx
	});

	let startOffset = selectStartOffsetInCurrentPage(segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
	let hasInferredStartOffset = false;

	for (let pageIdx = fromIdx + 1; pageIdx <= toIdx; pageIdx += 1) {
		const pageOffset = cumulativeOffsets[pageIdx];
		const firstPageOffset = cumulativeOffsets[fromIdx];
		const rawBoundary = pageOffset !== undefined && firstPageOffset !== undefined ? Math.max(0, pageOffset - firstPageOffset) : undefined;

		const mayInfer = !hasInferredStartOffset && pageIdx === fromIdx + 1;
		const match = resolveBoundaryMatch(segmentContent, pageIdx, rawBoundary, startOffset, mayInfer, pageIds, normalizedPages, logger);
		startOffset = match.startOffsetInFromPage;
		hasInferredStartOffset = hasInferredStartOffset || match.didInferStartOffset;

		const previous = positions[positions.length - 1];
		let candidate = match.pos;
		if (candidate <= previous) {
			// Probably an earlier duplicate of the page's opening text; look further on.
			const later = findPageStartAfterPosition(segmentContent, pageIdx, previous + 1, pageIds, normalizedPages);
			if (later > previous) candidate = later;
		}

		if (isBoundaryPositionValid(candidate, previous, match.expectedBoundary, contentLength)) {
			positions.push(candidate);
			continue;
		}
		const estimate = Math.max(previous + 1, match.expectedBoundary);
		positions.push(Math.min(estimate, contentLength));
	}

	positions.push(contentLength);
	logger?.debug?.("[breakpoints] buildBoundaryPositions: Complete", { boundaryCount: positions.length });
	return positions;
};
2233
/**
 * Builds the boundary position map for the pages of a segment.
 *
 * The map is computed once per segment and lets `findPageIndexForPosition`
 * resolve a character position to its page by binary search.
 *
 * Two strategies exist. The offset-only fast path is used when the segment
 * covers at least 1,000 pages AND its length equals the span of cumulative
 * offsets for those pages. In every other case the boundaries are found by
 * searching the segment content itself.
 *
 * @param segmentContent - Full segment content (already processed by structural rules)
 * @param fromIdx - Starting page index
 * @param toIdx - Ending page index
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized content
 * @param cumulativeOffsets - Cumulative character offsets (for estimates)
 * @param logger - Optional logger for debugging
 * @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
 *          with a sentinel boundary at segmentContent.length as the last element
 *
 * @example
 * // For a 3-page segment:
 * buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
 * // → [0, 23, 45, 67] where 67 is content.length (sentinel)
 */
const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
	const pageCount = toIdx - fromIdx + 1;
	const spanEnd = cumulativeOffsets[toIdx + 1] ?? 0;
	const spanStart = cumulativeOffsets[fromIdx] ?? 0;
	const offsetsDescribeContent = pageCount >= 1e3 && segmentContent.length === spanEnd - spanStart;
	if (offsetsDescribeContent) {
		return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
	}
	return buildBoundaryPositionsAccurate(segmentContent, fromIdx, toIdx, pageCount, pageIds, normalizedPages, cumulativeOffsets, logger);
};
2264
/**
 * Resolves a character position to the page it falls in, by binary search
 * over precomputed boundaries.
 *
 * The answer is the largest `i` with `boundaryPositions[i] <= position`. The
 * last array element is the end-of-content sentinel and is never a candidate.
 * Positions before the first boundary resolve to the first page.
 *
 * @param position - Character position in segmentContent
 * @param boundaryPositions - Precomputed boundary positions (from buildBoundaryPositions)
 * @param fromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[fromIdx])
 * @returns Page index in pageIds array
 *
 * @example
 * // With boundaries [0, 20, 40, 60] and fromIdx=0:
 * findPageIndexForPosition(15, boundaries, 0) // → 0 (first page)
 * findPageIndexForPosition(25, boundaries, 0) // → 1 (second page)
 * findPageIndexForPosition(40, boundaries, 0) // → 2 (exactly on boundary = that page)
 */
const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
	const boundaryCount = boundaryPositions.length;
	if (boundaryCount <= 1) return fromIdx;
	let low = 0;
	let high = boundaryCount - 2;
	while (low < high) {
		// Rounding up guarantees progress when `low` is kept.
		const probe = Math.ceil((low + high) / 2);
		if (boundaryPositions[probe] <= position) {
			low = probe;
		} else {
			high = probe - 1;
		}
	}
	return fromIdx + low;
};
2290
/**
 * Finds where a breakpoint window ends inside `remainingContent`.
 *
 * The window ends at the start of the page after `windowEndIdx`. That page
 * start is searched in the actual content; if it is not found, earlier pages
 * are tried in turn, down to the page after `currentFromIdx`. Searching the
 * content avoids depending on raw page offsets, which can drift when
 * structural rules strip markers (e.g. `lineStartsAfter`).
 *
 * When no page start is found, the boundary expected for the first page tried
 * is returned, capped at the content length. A window reaching the last page
 * ends at the end of the content.
 *
 * @param remainingContent - Content still to be split
 * @param currentFromIdx - Index of the page `remainingContent` starts in
 * @param windowEndIdx - Index of the last page inside the window
 * @param toIdx - Index of the last page of the segment
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @param cumulativeOffsets - Cumulative character offsets per page index
 * @param logger - Optional logger forwarded to the boundary search
 * @returns End position of the window within `remainingContent`
 */
const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
	const contentLength = remainingContent.length;
	if (windowEndIdx >= toIdx) return contentLength;

	const firstTriedIdx = Math.min(windowEndIdx + 1, toIdx);
	const lastTriedIdx = currentFromIdx + 1;
	const startOffset = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);

	const expectedBoundaryFor = (pageIdx) => {
		const pageOffset = cumulativeOffsets[pageIdx];
		const currentOffset = cumulativeOffsets[currentFromIdx];
		if (pageOffset === undefined || currentOffset === undefined) return contentLength;
		return Math.max(0, pageOffset - currentOffset - startOffset);
	};

	let fallbackBoundary = contentLength;
	for (let pageIdx = firstTriedIdx; pageIdx >= lastTriedIdx; pageIdx -= 1) {
		const expectedBoundary = expectedBoundaryFor(pageIdx);
		if (pageIdx === firstTriedIdx) fallbackBoundary = expectedBoundary;
		const found = findPageStartNearExpectedBoundary(remainingContent, pageIdx, expectedBoundary, pageIds, normalizedPages, logger);
		if (found > 0) return found;
	}
	return Math.min(fallbackBoundary, contentLength);
};
2312
/**
 * Finds a break position forced by page exclusions, using raw cumulative offsets.
 *
 * Pages excluded by any breakpoint must not be merged with their neighbours:
 * - If the starting page is excluded and is not the last page, the break is
 *   placed at the start of the following page.
 * - Otherwise the break is placed at the start of the first excluded page
 *   after the starting page, up to and including `windowEndIdx`.
 *
 * @param currentFromIdx - Index of the page the remaining content starts in
 * @param windowEndIdx - Index of the last page inside the window
 * @param toIdx - Index of the last page of the segment
 * @param pageIds - Array of all page IDs
 * @param expandedBreakpoints - Expanded breakpoints, each carrying an `excludeSet`
 * @param cumulativeOffsets - Cumulative character offsets per page index
 * @returns Break position as an offset distance from page `currentFromIdx`, or -1 when no exclusion applies
 */
const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets) => {
	const isExcludedAt = (pageIdx) => {
		const pageId = pageIds[pageIdx];
		return expandedBreakpoints.some(({ excludeSet }) => excludeSet.has(pageId));
	};
	const distanceTo = (pageIdx) => cumulativeOffsets[pageIdx] - cumulativeOffsets[currentFromIdx];

	if (isExcludedAt(currentFromIdx) && currentFromIdx < toIdx) {
		return distanceTo(currentFromIdx + 1);
	}
	for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx += 1) {
		if (isExcludedAt(pageIdx)) return distanceTo(pageIdx);
	}
	return -1;
};
2327
/**
 * Tells whether at least one page of an index range is in the exclude set.
 *
 * @param excludeSet - Set of excluded page IDs
 * @param pageIds - Array of page IDs
 * @param fromIdx - Start index (inclusive)
 * @param toIdx - End index (inclusive)
 * @returns True if any page in range is excluded
 */
const hasExcludedPageInRange = (excludeSet, pageIds, fromIdx, toIdx) => {
	// An empty set cannot match anything, so the scan is skipped entirely.
	if (excludeSet.size === 0) return false;
	let pageIdx = fromIdx;
	while (pageIdx <= toIdx) {
		if (excludeSet.has(pageIds[pageIdx])) return true;
		pageIdx += 1;
	}
	return false;
};
2341
/**
 * Finds where the next page's content begins within the remaining content.
 *
 * The search text is the start of the page's trimmed content, at most 30
 * characters long and no longer than `nextPageData.length` (presumably the
 * page's content length; confirm against the normalized page type).
 * A hit at index 0 counts as "not found".
 *
 * @param remainingContent - Content to search in
 * @param nextPageData - Normalized data for the next page
 * @returns Position of next page content, or -1 if not found
 */
const findNextPagePosition = (remainingContent, nextPageData) => {
	const pageText = nextPageData.content.trim();
	const needle = pageText.slice(0, Math.min(30, nextPageData.length));
	if (needle.length === 0) return -1;
	const hit = remainingContent.indexOf(needle);
	if (hit > 0) return hit;
	return -1;
};
2355
/**
 * Scans a window for regex matches and selects a break position according to
 * the preference and the split mode.
 *
 * @param windowContent - Content to search
 * @param regex - Regex to match (iterated with `matchAll`, so it must be global)
 * @param prefer - 'shorter' selects the first valid match; any other value selects the last
 * @param splitAt - If true, the break is BEFORE the match (its index). If false, it is AFTER the match (index + length).
 * @returns `{ groups, pos }` for the selected match, where `groups` are the
 *          match's named groups, or `{ pos: -1 }` when there is no valid match
 *
 * @remarks
 * - Matches with length 0 are skipped (prevents infinite loops with lookahead patterns)
 * - Matches that would result in position 0 are skipped (prevents empty first segments)
 * - For prefer:'shorter', scanning stops at the first valid match
 */
const findPatternBreakPosition = (windowContent, regex, prefer, splitAt = false) => {
	let selected;
	for (const match of windowContent.matchAll(regex)) {
		const matchStart = match.index ?? -1;
		const matchLength = match[0]?.length ?? 0;
		if (matchStart < 0 || matchLength === 0) continue;
		const pos = splitAt ? matchStart : matchStart + matchLength;
		if (pos === 0) continue;
		selected = { groups: match.groups, pos };
		if (prefer === "shorter") return selected;
	}
	return selected ?? { pos: -1 };
};
2394
/**
 * Looks for the start of the page right after `currentFromIdx` inside `remainingContent`.
 *
 * Only index `currentFromIdx + 1` is examined, and only when it does not exceed `toIdx`.
 *
 * @returns The position reported by `findNextPagePosition` when it is > 0 and <= `targetPos`, otherwise -1
 */
const findStartOfNextPageInWindow = (remainingContent, currentFromIdx, toIdx, pageIds, normalizedPages, targetPos) => {
	const candidateIdx = currentFromIdx + 1;
	if (!(candidateIdx > currentFromIdx) || !(candidateIdx <= toIdx)) {
		return -1;
	}
	const candidatePage = normalizedPages.get(pageIds[candidateIdx]);
	if (!candidatePage) {
		return -1;
	}
	const boundaryPos = findNextPagePosition(remainingContent, candidatePage);
	return boundaryPos > 0 && boundaryPos <= targetPos ? boundaryPos : -1;
};
2409
/**
 * Chooses a break position for a breakpoint that has no regex (page boundary).
 *
 * Unless the window end equals `maxContentLength`, the start of the next page is tried first.
 * Otherwise, or when that fails, the position falls back to a safe break before the target,
 * then to a unicode-adjusted target, and finally to the target itself when it already
 * reaches the end of `remainingContent`.
 *
 * @returns `{ pos }`, plus `splitReason` on the safe-break / unicode fallbacks
 *          (set to `undefined` when the window was not length-bounded)
 */
const handlePageBoundaryBreak = (remainingContent, currentFromIdx, windowEndPosition, maxContentLength, toIdx, pageIds, normalizedPages) => {
	const contentLength = remainingContent.length;
	const targetPos = Math.min(windowEndPosition, contentLength);
	const isLengthBounded = maxContentLength !== void 0 && windowEndPosition === maxContentLength;
	if (!isLengthBounded) {
		const pageStart = findStartOfNextPageInWindow(remainingContent, currentFromIdx, toIdx, pageIds, normalizedPages, targetPos);
		if (pageStart > 0) {
			return { pos: pageStart };
		}
	}
	if (!(targetPos < contentLength)) {
		return { pos: targetPos };
	}
	const reasonWhenBounded = (reason) => (isLengthBounded ? reason : void 0);
	const safePos = findSafeBreakPosition(remainingContent, targetPos);
	if (safePos !== -1) {
		return {
			pos: safePos,
			splitReason: reasonWhenBounded("whitespace")
		};
	}
	return {
		pos: adjustForUnicodeBoundary(remainingContent, targetPos),
		splitReason: reasonWhenBounded("unicode_boundary")
	};
};
2429
/**
 * Evaluates the breakpoint at index `i` against the current window.
 *
 * The breakpoint is skipped (returns null) when the current page is outside the rule's range,
 * when an excluded page lies in the window, or when `skipWhenRegex` matches `remainingContent`.
 * A `null` regex is handled as a page-boundary break; otherwise the regex is searched within the
 * first `windowEndPosition` characters.
 *
 * @returns A match descriptor (`breakPos`, `breakpointIndex`, `rule`, ...) or null
 */
const checkBreakpointMatch = (i, remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx, maxContentLength) => {
	const { pageIds } = ctx;
	const bpCtx = ctx.expandedBreakpoints[i];
	const { rule, regex, excludeSet, skipWhenRegex } = bpCtx;
	const isSkipped = !isInBreakpointRange(pageIds[currentFromIdx], rule)
		|| hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)
		|| Boolean(skipWhenRegex?.test(remainingContent));
	if (isSkipped) {
		return null;
	}
	if (regex === null) {
		const boundary = handlePageBoundaryBreak(remainingContent, currentFromIdx, windowEndPosition, maxContentLength, toIdx, pageIds, ctx.normalizedPages);
		let contentLengthSplit;
		if (boundary.splitReason && maxContentLength) {
			contentLengthSplit = {
				maxContentLength,
				reason: boundary.splitReason
			};
		}
		return {
			breakPos: boundary.pos,
			breakpointIndex: i,
			contentLengthSplit,
			rule
		};
	}
	const windowLimit = Math.min(windowEndPosition, remainingContent.length);
	const windowText = remainingContent.slice(0, windowLimit);
	const { pos: breakPos, groups } = findPatternBreakPosition(windowText, regex, ctx.prefer, bpCtx.splitAt);
	if (!(breakPos > 0)) {
		return null;
	}
	return {
		breakPos,
		breakpointIndex: i,
		rule,
		wordIndex: extractDebugIndex(groups, "_w")
	};
};
2457
/**
 * Tries each expanded breakpoint in order and returns the first one that yields a match.
 *
 * @param remainingContent - Content remaining to be segmented
 * @param currentFromIdx - Current starting page index
 * @param toIdx - Ending page index
 * @param windowEndIdx - Maximum window end index
 * @param windowEndPosition - End position of the window within `remainingContent`
 * @param ctx - Breakpoint context; `ctx.expandedBreakpoints` is iterated here
 * @param maxContentLength - Forwarded unchanged to `checkBreakpointMatch`
 * @returns The first truthy result of `checkBreakpointMatch`, or null when no breakpoint matched
 */
const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx, maxContentLength) => {
	const { expandedBreakpoints } = ctx;
	let index = 0;
	while (index < expandedBreakpoints.length) {
		const candidate = checkBreakpointMatch(index, remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx, maxContentLength);
		if (candidate) {
			return candidate;
		}
		index += 1;
	}
	return null;
};
2476
/**
 * Walks backward from a target position looking for a "safe" split point,
 * i.e. the index right after a character that `STOP_CHARACTERS` matches.
 *
 * @param content The text content
 * @param targetPosition The desired split position (upper limit of the search)
 * @param lookbackChars Maximum number of characters to walk back (default 100)
 * @returns The split index, or -1 when no stop character lies within the lookback range
 */
const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) => {
	const lowerBound = Math.max(0, targetPosition - lookbackChars);
	let cursor = targetPosition - 1;
	while (cursor >= lowerBound) {
		if (STOP_CHARACTERS.test(content[cursor])) {
			return cursor + 1;
		}
		cursor -= 1;
	}
	return -1;
};
2493
- //#endregion
2494
- //#region src/segmentation/debug-meta.ts
2495
/**
 * Normalizes the `debug` option into a concrete configuration.
 *
 * @param debug - `true`, an object with optional `metaKey` / `include`, or anything else
 * @returns `{ includeBreakpoint, includeRule, metaKey }`, or null when `debug` is neither `true` nor an object
 */
const resolveDebugConfig = (debug) => {
	const defaultKey = "_flappa";
	if (debug === true) {
		return {
			includeBreakpoint: true,
			includeRule: true,
			metaKey: defaultKey
		};
	}
	if (!debug || typeof debug !== "object") {
		return null;
	}
	const customKey = debug.metaKey;
	const includeList = debug.include;
	// Without an `include` array every kind of debug info is enabled.
	const isEnabled = (name) => (Array.isArray(includeList) ? includeList.includes(name) : true);
	return {
		includeBreakpoint: isEnabled("breakpoint"),
		includeRule: isEnabled("rule"),
		metaKey: typeof customKey === "string" && customKey ? customKey : defaultKey
	};
};
2510
/**
 * Returns the first entry of `PATTERN_TYPE_KEYS` that is a key of `rule`, or "regex" when none is.
 */
const getRulePatternType = (rule) => {
	for (const key of PATTERN_TYPE_KEYS) {
		if (key in rule) {
			return key ?? "regex";
		}
	}
	return "regex";
};
2513
/** True for any non-null, non-array value whose `typeof` is "object". */
const isPlainObject = (v) => {
	if (!v) {
		return false;
	}
	return typeof v === "object" && !Array.isArray(v);
};
2514
/**
 * Returns a shallow copy of `meta` in which `meta[metaKey]` is extended with `patch`.
 *
 * An existing value under `metaKey` is kept (and shallow-merged) only when it is a non-null,
 * non-array object; any other existing value is replaced. The input `meta` is not mutated.
 */
const mergeDebugIntoMeta = (meta, metaKey, patch) => {
	const merged = { ...(meta ? meta : {}) };
	const previous = merged[metaKey];
	const canExtend = Boolean(previous) && typeof previous === "object" && !Array.isArray(previous);
	const base = canExtend ? previous : {};
	merged[metaKey] = { ...base, ...patch };
	return merged;
};
2523
/**
 * Builds the `{ rule: ... }` debug patch for a matched rule.
 *
 * `wordIndex` is included when defined; `word` is included when, in addition, the rule's
 * pattern list (looked up via `getRulePatternType`) is an array with a defined entry at that index.
 */
const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
	const patternType = getRulePatternType(rule);
	const details = {
		index: ruleIndex,
		patternType
	};
	if (wordIndex !== void 0) {
		details.wordIndex = wordIndex;
		const candidates = rule[patternType];
		if (Array.isArray(candidates) && candidates[wordIndex] !== void 0) {
			details.word = candidates[wordIndex];
		}
	}
	return { rule: details };
};
2534
/**
 * Builds the `{ breakpoint: ... }` debug patch for a matched breakpoint.
 *
 * `kind` is "pageBoundary" for an empty-string `pattern`, "regex" when `rule.regex` is truthy,
 * and "pattern" otherwise. `wordIndex` is added when defined, and `word` when `rule.words` exists too.
 */
const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => {
	let kind = "pattern";
	if (rule.pattern === "") {
		kind = "pageBoundary";
	} else if (rule.regex) {
		kind = "regex";
	}
	const breakpoint = {
		index: breakpointIndex,
		kind,
		pattern: rule.pattern ?? rule.regex
	};
	if (wordIndex !== void 0) {
		breakpoint.wordIndex = wordIndex;
		if (rule.words) {
			breakpoint.word = rule.words[wordIndex];
		}
	}
	return { breakpoint };
};
2541
/**
 * Formats the `rule` entry of a debug record.
 *
 * @param rule - `{ index, patternType, wordIndex?, word? }`
 * @param concise - When truthy, returns the short `Rule: ...` form
 */
const formatRuleReason = (rule, concise) => {
	const { index, patternType, wordIndex, word } = rule;
	if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
	const wordInfo = word ? ` (Matched: "${word}")` : "";
	return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
};
/**
 * Formats the `breakpoint` entry of a debug record.
 *
 * @param breakpoint - `{ index, kind, pattern, wordIndex?, word? }`
 * @param concise - When truthy, returns the short `Breakpoint: ...` form
 */
const formatBreakpointReason = (breakpoint, concise) => {
	const { index, kind, pattern, wordIndex, word } = breakpoint;
	if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
	if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
	if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
	return `Breakpoint #${index} (${kind}) - "${pattern}"`;
};
/**
 * Formats the `contentLengthSplit` entry of a debug record.
 *
 * @param split - `{ maxContentLength, splitReason }`
 * @param concise - When truthy, returns the short `> N (reason)` form
 */
const formatContentLengthReason = (split, concise) => {
	const { maxContentLength, splitReason } = split;
	if (concise) return `> ${maxContentLength} (${splitReason})`;
	return `Safety Split (${splitReason}) > ${maxContentLength}`;
};
/**
 * Formats the debug info stored in segment metadata into a human-readable string.
 *
 * The first present entry wins, in the order `rule`, `breakpoint`, `contentLengthSplit`.
 *
 * @param meta - The segment metadata object
 * @param options - Formatting options
 * @param options.concise - Use the short form of each message
 * @param options.metaKey - Key under which the debug info is stored; defaults to "_flappa".
 *        Pass the same custom `metaKey` that was used in the debug configuration.
 * @returns "-" when no debug info is present, "Unknown" when it has none of the known entries
 */
const getDebugReason = (meta, options) => {
	const requestedKey = options?.metaKey;
	// Fall back to the default key unless a non-empty string key was supplied.
	const metaKey = typeof requestedKey === "string" && requestedKey ? requestedKey : "_flappa";
	const debug = meta?.[metaKey];
	if (!debug) return "-";
	const concise = options?.concise;
	if (debug.rule) return formatRuleReason(debug.rule, concise);
	if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
	if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
	return "Unknown";
};
/**
 * Convenience helper to get the formatted debug reason directly from a segment.
 * @param segment - The segment object; its `meta` is passed to `getDebugReason`
 * @param options - Formatting options (same as `getDebugReason`)
 */
const getSegmentDebugReason = (segment, options) => {
	return getDebugReason(segment.meta, options);
};
2586
- //#endregion
2587
- //#region src/segmentation/pattern-validator.ts
2588
/** Set of the token names returned by `getAvailableTokens()`. */
const KNOWN_TOKENS = new Set(getAvailableTokens());
/** Matches `{{name}}` or `{{name:suffix}}` and captures `name`. */
const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
/**
 * Builds a global regex that matches a known token name (optionally followed by `:suffix`)
 * that is not preceded by `{{` and not followed by `}}`.
 * Longer names come first in the alternation so they are tried before shorter ones.
 */
const buildBareTokenRegex = () => {
	const alternation = Array.from(KNOWN_TOKENS)
		.sort((left, right) => right.length - left.length)
		.join("|");
	return new RegExp(`(?<!\\{\\{)(${alternation})(?::\\w+)?(?!\\}\\})`, "g");
};
2594
/**
 * Validates a single pattern string.
 *
 * Checks, in order: blank pattern, duplicate of a pattern already in `seenPatterns`,
 * unknown token names inside `{{...}}`, and known token names that lack surrounding braces.
 * The pattern is added to `seenPatterns` once the duplicate check has passed.
 *
 * @returns The first issue found, or undefined when the pattern passes every check
 */
const validatePattern = (pattern, seenPatterns) => {
	if (pattern.trim() === "") {
		return {
			message: "Empty pattern is not allowed",
			type: "empty_pattern"
		};
	}
	if (seenPatterns.has(pattern)) {
		return {
			message: `Duplicate pattern: "${pattern}"`,
			pattern,
			type: "duplicate"
		};
	}
	seenPatterns.add(pattern);
	TOKEN_INSIDE_BRACES.lastIndex = 0;
	for (const [, name] of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
		if (KNOWN_TOKENS.has(name)) continue;
		const sample = [...KNOWN_TOKENS].slice(0, 5).join(", ");
		return {
			message: `Unknown token: {{${name}}}. Available tokens: ${sample}...`,
			suggestion: "Check spelling or use a known token",
			token: name,
			type: "unknown_token"
		};
	}
	for (const match of pattern.matchAll(buildBareTokenRegex())) {
		const [full, name] = match;
		const start = match.index;
		const end = start + full.length;
		const hasOpening = pattern.slice(Math.max(0, start - 2), start) === "{{";
		const hasClosing = pattern.slice(end, end + 2) === "}}";
		if (hasOpening && hasClosing) continue;
		return {
			message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
			suggestion: `{{${full}}}`,
			token: name,
			type: "missing_braces"
		};
	}
};
2629
/**
 * Validates every pattern in an array, sharing one duplicate-tracking set across them.
 *
 * @returns An array parallel to `patterns` holding each pattern's issue (or undefined),
 *          or undefined when no pattern produced an issue
 */
const validatePatternArray = (patterns) => {
	const seen = /* @__PURE__ */ new Set();
	const issues = patterns.map((candidate) => validatePattern(candidate, seen));
	if (issues.every((issue) => !issue)) {
		return void 0;
	}
	return issues;
};
2637
/**
 * Validates `patterns` and, when issues exist, stores them on `result[key]`.
 *
 * @returns true when issues were recorded, false when `patterns` is falsy or clean
 */
const applyRulePatternValidation = (result, key, patterns) => {
	const issues = patterns ? validatePatternArray(patterns) : void 0;
	if (!issues) {
		return false;
	}
	result[key] = issues;
	return true;
};
2644
/**
 * Validates `rule.template` (when defined) as a single pattern with a fresh duplicate set.
 *
 * @returns true when an issue was stored on `result.template`, false otherwise
 */
const validateTemplateRule = (rule, result) => {
	const { template } = rule;
	if (template === void 0) {
		return false;
	}
	const issue = validatePattern(template, /* @__PURE__ */ new Set());
	if (issue) {
		result.template = issue;
		return true;
	}
	return false;
};
2651
/**
 * Validates `rule.regex` (when defined): it must be non-blank and compile with the `u` flag.
 *
 * @returns true when an issue was stored on `result.regex`, false otherwise
 */
const validateRegexRule = (rule, result) => {
	const source = rule.regex;
	if (source === void 0) {
		return false;
	}
	if (source.trim() === "") {
		result.regex = {
			message: "Empty pattern is not allowed",
			type: "empty_pattern"
		};
		return true;
	}
	try {
		// Compilation is attempted only to surface syntax errors; the instance is discarded.
		new RegExp(source, "u");
	} catch (error) {
		const message = error instanceof Error ? error.message : String(error);
		result.regex = {
			message,
			pattern: source,
			type: "invalid_regex"
		};
		return true;
	}
	return false;
};
2672
- const invalidDictionaryEntryIssue = (message) => ({
2673
- message,
2674
- type: "invalid_option"
2675
- });
2676
- const validateDictionaryEntryRule = (rule, result) => {
2677
- if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
2678
- const issues = {};
2679
- const { allowCommaSeparated, allowParenthesized, allowWhitespaceBeforeColon, captureName, maxLetters, midLineSubentries, minLetters, stopWords } = rule.dictionaryEntry;
2680
- if (!Array.isArray(stopWords) || stopWords.some((word) => typeof word !== "string" || !word.trim())) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
2681
- if (allowCommaSeparated !== void 0 && typeof allowCommaSeparated !== "boolean") issues.allowCommaSeparated = invalidDictionaryEntryIssue("allowCommaSeparated must be a boolean");
2682
- if (allowParenthesized !== void 0 && typeof allowParenthesized !== "boolean") issues.allowParenthesized = invalidDictionaryEntryIssue("allowParenthesized must be a boolean");
2683
- if (allowWhitespaceBeforeColon !== void 0 && typeof allowWhitespaceBeforeColon !== "boolean") issues.allowWhitespaceBeforeColon = invalidDictionaryEntryIssue("allowWhitespaceBeforeColon must be a boolean");
2684
- if (midLineSubentries !== void 0 && typeof midLineSubentries !== "boolean") issues.midLineSubentries = invalidDictionaryEntryIssue("midLineSubentries must be a boolean");
2685
- if (captureName !== void 0 && !captureName.match(/^[A-Za-z_]\w*$/)) issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
2686
- if (minLetters !== void 0 && (!Number.isInteger(minLetters) || minLetters < 1)) issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
2687
- if (maxLetters !== void 0 && (!Number.isInteger(maxLetters) || maxLetters < (minLetters ?? 2))) issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${minLetters ?? 2}`);
2688
- if (Object.keys(issues).length === 0) return false;
2689
- result.dictionaryEntry = issues;
2690
- return true;
2691
- };
2692
/**
 * Turns one validation issue into a single `"<loc>: <text>"` line.
 *
 * @param _type - Unused
 * @param issue - The issue to format (may be falsy)
 * @param loc - Location prefix for the message
 * @returns The formatted line, or null when `issue` is falsy
 */
const formatValidationIssue = (_type, issue, loc) => {
	if (!issue) {
		return null;
	}
	switch (issue.type) {
		case "missing_braces":
			return `${loc}: Missing {{}} around token "${issue.token}"`;
		case "unknown_token":
			return `${loc}: Unknown token "{{${issue.token}}}"`;
		case "duplicate":
			return `${loc}: Duplicate pattern "${issue.pattern}"`;
		case "invalid_regex":
			return `${loc}: Invalid regex (${issue.message})`;
		default:
			// Any other type falls back to its message, or to the type name itself.
			return `${loc}: ${issue.message || issue.type}`;
	}
};
2700
/**
 * Validates split rules for common pattern issues.
 *
 * Runs, for every rule, the pattern-array checks on `lineStartsWith`, `lineStartsAfter` and
 * `lineEndsWith`, then the template, regex and dictionary-entry checks. All checks always run,
 * so a single rule can report issues under several keys.
 *
 * @param rules - Array of split rules to validate
 * @returns Array parallel to input with validation results (undefined if no issues)
 *
 * @example
 * const issues = validateRules([
 *   { lineStartsAfter: ['raqms:num'] },      // Missing braces
 *   { lineStartsWith: ['{{unknown}}'] },     // Unknown token
 * ]);
 * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
 * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
 */
const validateRules = (rules) => rules.map((rule) => {
	const result = {};
	// Built eagerly on purpose: each validator writes its findings into `result`.
	const outcomes = [
		applyRulePatternValidation(result, "lineStartsWith", rule.lineStartsWith),
		applyRulePatternValidation(result, "lineStartsAfter", rule.lineStartsAfter),
		applyRulePatternValidation(result, "lineEndsWith", rule.lineEndsWith),
		validateTemplateRule(rule, result),
		validateRegexRule(rule, result),
		validateDictionaryEntryRule(rule, result)
	];
	return outcomes.some(Boolean) ? result : void 0;
});
2729
/**
 * Flattens a validation result array into human-readable error lines.
 *
 * Rules are numbered from 1 in the messages; entries without issues contribute nothing.
 *
 * @param results - The result array from `validateRules()`
 * @returns Array of formatted error strings
 *
 * @example
 * const issues = validateRules(rules);
 * const errors = formatValidationReport(issues);
 * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
 */
const formatValidationReport = (results) => {
	const lines = [];
	results.forEach((result, position) => {
		if (!result) return;
		for (const [type, issues] of Object.entries(result)) {
			lines.push(...formatValidationIssues(type, issues, position + 1));
		}
	});
	return lines;
};
2746
/**
 * Formats all issues recorded under one result key of one rule.
 *
 * For `dictionaryEntry` with a non-array object value, each field gets its own line with a
 * `dictionaryEntry.<field>` location. Otherwise the value is treated as a list (a single issue
 * is wrapped in an array). Issues that format to null are dropped.
 */
const formatValidationIssues = (type, issues, ruleNumber) => {
	const dropNulls = (messages) => messages.filter((msg) => msg !== null);
	const isFieldMap = type === "dictionaryEntry" && issues && typeof issues === "object" && !Array.isArray(issues);
	if (isFieldMap) {
		const perField = Object.entries(issues).map(([field, issue]) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}.${field}`));
		return dropNulls(perField);
	}
	const list = Array.isArray(issues) ? issues : [issues];
	return dropNulls(list.map((issue) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}`)));
};
2750
- //#endregion
2751
- //#region src/segmentation/breakpoint-processor.ts
2752
/** Builds a Map from each page id to its index in `pageIds` (a repeated id keeps its last index). */
const buildPageIdToIndexMap = (pageIds) => {
	const indexById = new Map();
	pageIds.forEach((id, position) => {
		indexById.set(id, position);
	});
	return indexById;
};
2753
/**
 * Builds a Map from page id to `{ content, index, length }`, pairing `pages[i]` with
 * `normalizedContent[i]`.
 */
const buildNormalizedPagesMap = (pages, normalizedContent) => {
	const normalizedPages = /* @__PURE__ */ new Map();
	for (const [position, page] of pages.entries()) {
		const content = normalizedContent[position];
		normalizedPages.set(page.id, {
			content,
			index: position,
			length: content.length
		});
	}
	return normalizedPages;
};
2765
/**
 * Builds the running offsets of pages laid out back to back.
 *
 * The result starts with 0 and gets one entry per page: the running total after adding that
 * page's `length` (0 when the page is missing from the map) plus 1 for every page but the last.
 */
const buildCumulativeOffsets = (pageIds, normalizedPages) => {
	const offsets = [0];
	const lastPosition = pageIds.length - 1;
	let running = 0;
	for (let position = 0; position <= lastPosition; position++) {
		running += normalizedPages.get(pageIds[position])?.length ?? 0;
		if (position < lastPosition) {
			running += 1;
		}
		offsets.push(running);
	}
	return offsets;
};
2776
/** True when any breakpoint's `excludeSet` reports an excluded page between `fromIdx` and `toIdx`. */
const hasAnyExclusionsInRange = (expandedBreakpoints, pageIds, fromIdx, toIdx) => {
	const hitsExclusion = (bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx);
	return expandedBreakpoints.some(hitsExclusion);
};
2777
/**
 * Finds the last index, starting at `currentFromIdx` and not beyond `toIdx`, whose page id is
 * at most `pageIds[currentFromIdx] + maxPages`. Scanning stops at the first id above that limit.
 */
const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
	const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
	let windowEndIdx = currentFromIdx;
	let probe = currentFromIdx;
	while (probe <= toIdx && pageIds[probe] <= maxWindowPageId) {
		windowEndIdx = probe;
		probe += 1;
	}
	return windowEndIdx;
};
2784
/** Difference between the page id at `toIdx` and the page id at `currentFromIdx`. */
const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => {
	const firstId = pageIds[currentFromIdx];
	const lastId = pageIds[toIdx];
	return lastId - firstId;
};
2785
/**
 * Creates the last segment from the remaining content via `createSegment`.
 * The end page id is passed only when `currentFromIdx` differs from `toIdx`, and `meta`
 * only when `includeMeta` is truthy.
 */
const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => {
	const fromId = pageIds[currentFromIdx];
	const toId = currentFromIdx !== toIdx ? pageIds[toIdx] : void 0;
	const attachedMeta = includeMeta ? meta : void 0;
	return createSegment(remainingContent, fromId, toId, attachedMeta);
};
2786
/**
 * Resolves the start and end page indices of a piece from its character range, using
 * `findPageIndexForPosition` on the precomputed boundary positions.
 *
 * @param pieceStartPos - Start position of the piece in the full segment content
 * @param pieceEndPos - End position (exclusive) of the piece
 * @param boundaryPositions - Precomputed boundary positions
 * @param baseFromIdx - Base page index forwarded to `findPageIndexForPosition`
 * @param toIdx - Upper limit applied to the end index
 * @returns Object with actualStartIdx and actualEndIdx
 */
const computePiecePages = (pieceStartPos, pieceEndPos, boundaryPositions, baseFromIdx, toIdx) => {
	const startPage = findPageIndexForPosition(pieceStartPos, boundaryPositions, baseFromIdx);
	// The end is exclusive, so look up the last contained character (never before the start).
	const lastCharPos = Math.max(pieceStartPos, pieceEndPos - 1);
	const endPage = findPageIndexForPosition(lastCharPos, boundaryPositions, baseFromIdx);
	return {
		actualEndIdx: Math.min(endPage, toIdx),
		actualStartIdx: startPage
	};
};
2805
/**
 * Decides whether the next piece starts on the page after `actualEndIdx`.
 *
 * Advances to `actualEndIdx + 1` when that page exists within `toIdx`, has a non-empty
 * 30-character prefix, and either `remainingContent` starts with that prefix or the page content
 * starts with the first 30 characters of the left-trimmed `remainingContent`.
 *
 * @returns `actualEndIdx + 1` when the content lines up with the next page, else `actualEndIdx`
 */
const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages) => {
	const followingIdx = actualEndIdx + 1;
	if (!remainingContent || !(followingIdx <= toIdx)) {
		return actualEndIdx;
	}
	const followingPage = normalizedPages.get(pageIds[followingIdx]);
	if (!followingPage) {
		return actualEndIdx;
	}
	const pagePrefix = followingPage.content.slice(0, 30);
	if (!pagePrefix) {
		return actualEndIdx;
	}
	const contentPrefix = remainingContent.trimStart().slice(0, 30);
	const linesUp = remainingContent.startsWith(pagePrefix) || followingPage.content.startsWith(contentPrefix);
	return linesUp ? followingIdx : actualEndIdx;
};
2817
/**
 * Creates a segment for one piece via `createSegment`.
 * The end page id is passed only when `actualEndIdx` is greater than `actualStartIdx`, and
 * `meta` only when `includeMeta` is truthy.
 */
const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => {
	const fromId = pageIds[actualStartIdx];
	const toId = actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0;
	const attachedMeta = includeMeta ? meta : void 0;
	return createSegment(pieceContent, fromId, toId, attachedMeta);
};
2818
/**
 * Finds the break offset within a window.
 *
 * Order of attempts: an exclusion-based break (only when some breakpoint excludes a page in the
 * window), then the breakpoint patterns, then - when the window ends before the content does -
 * a safe break before the window end or a unicode-adjusted window end.
 *
 * @returns An object whose `breakOffset` is relative to `remainingContent`; pattern matches also
 *          carry `breakpointIndex`, `breakpointRule`, `contentLengthSplit` and `wordIndex`.
 *          Falls back to `{ breakOffset: windowEndPosition }`.
 */
const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer, maxContentLength) => {
	if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
		const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
		if (exclusionBreak > 0) {
			return { breakOffset: exclusionBreak };
		}
	}
	const searchCtx = {
		expandedBreakpoints,
		normalizedPages,
		pageIds,
		prefer
	};
	const patternMatch = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, searchCtx, maxContentLength);
	if (patternMatch && patternMatch.breakPos > 0) {
		const { breakPos, breakpointIndex, rule, contentLengthSplit, wordIndex } = patternMatch;
		return {
			breakOffset: breakPos,
			breakpointIndex,
			breakpointRule: rule,
			contentLengthSplit,
			wordIndex
		};
	}
	if (!(windowEndPosition < remainingContent.length)) {
		return { breakOffset: windowEndPosition };
	}
	// Split details are attached only when a content-length limit is configured.
	const describeSplit = (reason) => (maxContentLength ? { maxContentLength, reason } : void 0);
	const safeOffset = findSafeBreakPosition(remainingContent, windowEndPosition);
	if (safeOffset !== -1) {
		return {
			breakOffset: safeOffset,
			contentLengthSplit: describeSplit("whitespace")
		};
	}
	return {
		breakOffset: adjustForUnicodeBoundary(remainingContent, windowEndPosition),
		contentLengthSplit: describeSplit("unicode_boundary")
	};
};
2860
/**
 * Returns the first position at or after `startPos` whose character is not whitespace
 * (per `/\s/`), or the first position past the end of `content`.
 */
const skipWhitespace = (content, startPos) => {
	let cursor = startPos;
	for (; cursor < content.length; cursor++) {
		if (!/\s/.test(content[cursor])) {
			break;
		}
	}
	return cursor;
};
2868
/**
 * Checks that the length implied by the cumulative offsets for pages `fromIdx..toIdx` agrees
 * with the actual content length, within a tolerance of max(100, 1% of the content length).
 *
 * When the check fails and `pageCount` is at least 1000, a warning is sent to `logger.warn`
 * (if present).
 *
 * @returns true when the lengths agree within the tolerance
 */
const checkFastPathAlignment = (cumulativeOffsets, fullContent, fromIdx, toIdx, pageCount, logger) => {
	const actualLength = fullContent.length;
	const rangeEnd = cumulativeOffsets[toIdx + 1] ?? actualLength;
	const rangeStart = cumulativeOffsets[fromIdx] ?? 0;
	const expectedLength = rangeEnd - rangeStart;
	const drift = Math.abs(expectedLength - actualLength);
	const isAligned = drift <= Math.max(100, actualLength * 0.01);
	if (!isAligned && pageCount >= 1e3) {
		logger?.warn?.("[breakpoints] Offset drift detected in fast-path candidate, falling back to slow path", {
			actualLength,
			drift,
			expectedLength,
			pageCount
		});
	}
	return isAligned;
};
2885
/**
 * Fast path for maxPages=0: emits one segment per page, taken straight from the normalized
 * page content.
 *
 * Pages that are missing or whose trimmed content is empty are skipped. Metadata comes from
 * `getSegmentMetaWithDebug`, with only the page at `fromIdx` flagged as the first piece.
 */
const processTrivialFastPath = (fromIdx, toIdx, pageIds, normalizedPages, pageCount, originalMeta, debugMetaKey, logger) => {
	logger?.debug?.("[breakpoints] Using trivial per-page fast-path (maxPages=0)", {
		fromIdx,
		pageCount,
		toIdx
	});
	const segments = [];
	for (let pageIdx = fromIdx; pageIdx <= toIdx; pageIdx++) {
		const pageId = pageIds[pageIdx];
		const text = normalizedPages.get(pageId)?.content.trim();
		if (!text) continue;
		const meta = getSegmentMetaWithDebug(pageIdx === fromIdx, debugMetaKey, originalMeta, null);
		const segment = createSegment(text, pageId, void 0, meta);
		if (segment) {
			segments.push(segment);
		}
	}
	return segments;
};
2906
/**
 * Slices the trimmed text of pages `segStart..segEnd` out of `fullContent` using cumulative
 * offsets rebased by `baseOffset`.
 *
 * The slice runs to the end of `fullContent` when `segEnd` is not below `toIdx`; missing
 * offsets fall back to 0 (start) or `fullContent.length` (end), and rebased offsets are
 * clamped at 0.
 */
const buildFastPathRawContent = (fullContent, baseOffset, cumulativeOffsets, segStart, segEnd, toIdx) => {
	const rebase = (absolute) => Math.max(0, absolute - baseOffset);
	const sliceFrom = rebase(cumulativeOffsets[segStart] ?? 0);
	let sliceTo = fullContent.length;
	if (segEnd < toIdx) {
		sliceTo = rebase(cumulativeOffsets[segEnd + 1] ?? fullContent.length);
	}
	return fullContent.slice(sliceFrom, sliceTo).trim();
};
2915
/**
 * Builds one fast-path segment covering pages `segStart..segEnd`.
 *
 * @returns null when the sliced content is empty; otherwise `{ content, from }` with `to` added
 *          when the segment spans more than one page index and `meta` added when
 *          `getSegmentMetaWithDebug` returns a truthy value
 */
const buildFastPathSegment = (fullContent, baseOffset, cumulativeOffsets, segStart, segEnd, fromIdx, toIdx, pageIds, originalMeta, debugMetaKey) => {
	const content = buildFastPathRawContent(fullContent, baseOffset, cumulativeOffsets, segStart, segEnd, toIdx);
	if (!content) {
		return null;
	}
	const isFirstPiece = segStart === fromIdx;
	const meta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, originalMeta, null);
	const segment = {
		content,
		from: pageIds[segStart]
	};
	if (segEnd > segStart) {
		segment.to = pageIds[segEnd];
	}
	if (meta) {
		segment.meta = meta;
	}
	return segment;
};
2927
/**
 * Offset-based fast path for maxPages > 0.
 *
 * Leading pages are emitted as single-page segments for as long as the page-id span from the
 * current start to `toIdx` exceeds `maxPages`; the pages left over are emitted as one final
 * segment. Empty segments are dropped.
 */
const processOffsetFastPath = (fullContent, fromIdx, toIdx, pageIds, cumulativeOffsets, maxPages, originalMeta, debugMetaKey, logger) => {
	const segments = [];
	logger?.debug?.("[breakpoints] Using offset-based fast-path for large segment", {
		fromIdx,
		maxPages,
		pageCount: toIdx - fromIdx + 1,
		toIdx
	});
	const baseOffset = cumulativeOffsets[fromIdx] ?? 0;
	const emit = (startIdx, endIdx) => {
		const segment = buildFastPathSegment(fullContent, baseOffset, cumulativeOffsets, startIdx, endIdx, fromIdx, toIdx, pageIds, originalMeta, debugMetaKey);
		if (segment) {
			segments.push(segment);
		}
	};
	let segStart = fromIdx;
	while (segStart <= toIdx && pageIds[toIdx] - pageIds[segStart] > maxPages) {
		emit(segStart, segStart);
		segStart += 1;
	}
	if (segStart <= toIdx) {
		emit(segStart, toIdx);
	}
	return segments;
};
2949
/**
 * Emits the remaining content as the final segment when it already satisfies every limit.
 *
 * The content "fits" when its page-id span is at most `maxPages`, its length is at most
 * `maxContentLength` (when set), and no breakpoint excludes a page within its range.
 *
 * @param actualRemainingEndIdx - End page index of the remaining content, used instead of the
 *        original segment's toIdx for the span, exclusion and segment-creation steps
 * @returns true when the remaining content fit (a final segment is pushed onto `result` if one
 *          was created), false otherwise
 */
const handleOversizedSegmentFit = (remainingContent, currentFromIdx, actualRemainingEndIdx, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, result) => {
	const remainingSpan = computeRemainingSpan(currentFromIdx, actualRemainingEndIdx, pageIds);
	const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, actualRemainingEndIdx);
	const exceedsPages = !(remainingSpan <= maxPages);
	const exceedsLength = Boolean(maxContentLength) && !(remainingContent.length <= maxContentLength);
	if (exceedsPages || exceedsLength || remainingHasExclusions) {
		return false;
	}
	const includeMeta = isFirstPiece || Boolean(debugMetaKey);
	const meta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint);
	const finalSeg = createFinalSegment(remainingContent, currentFromIdx, actualRemainingEndIdx, pageIds, meta, includeMeta);
	if (finalSeg) {
		result.push(finalSeg);
	}
	return true;
};
2970
/**
 * Builds metadata for a segment piece, optionally including debug info.
 *
 * Returns undefined when the piece is neither the first one nor in debug mode. The first piece
 * starts from `originalMeta`; with a `debugMetaKey`, breakpoint and content-length-split details
 * are merged in under that key.
 */
const getSegmentMetaWithDebug = (isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, contentLengthSplit) => {
	if (!isFirstPiece && !debugMetaKey) {
		return void 0;
	}
	let meta = isFirstPiece ? originalMeta : void 0;
	if (!debugMetaKey) {
		return meta;
	}
	if (lastBreakpoint) {
		const { breakpointIndex, rule, wordIndex } = lastBreakpoint;
		meta = mergeDebugIntoMeta(meta, debugMetaKey, buildBreakpointDebugPatch(breakpointIndex, rule, wordIndex));
	}
	if (contentLengthSplit) {
		// The split's `reason` is stored under the name `splitReason` in the debug record.
		const splitPatch = { contentLengthSplit: {
			maxContentLength: contentLengthSplit.maxContentLength,
			splitReason: contentLengthSplit.reason
		} };
		meta = mergeDebugIntoMeta(meta, debugMetaKey, splitPatch);
	}
	return meta;
};
2985
/**
 * Returns the window end position from `findBreakpointWindowEndPosition`, capped at
 * `maxContentLength` when that limit is truthy.
 */
const getWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger) => {
	const windowEnd = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
	if (!maxContentLength) {
		return windowEnd;
	}
	return Math.min(windowEnd, maxContentLength);
};
2992
/**
 * Computes the cursor position and starting page index for the next iteration.
 *
 * The cursor moves past whitespace following `breakPos`; the page index is then derived by
 * `computeNextFromIdx` from up to 500 characters of content starting at the new cursor.
 */
const advanceCursorAndIndex = (fullContent, breakPos, actualEndIdx, toIdx, pageIds, normalizedPages) => {
	const cursorPos = skipWhitespace(fullContent, breakPos);
	const lookahead = fullContent.slice(cursorPos, cursorPos + 500);
	const currentFromIdx = computeNextFromIdx(lookahead, actualEndIdx, toIdx, pageIds, normalizedPages);
	return {
		currentFromIdx,
		cursorPos
	};
};
3002
/**
 * Computes the page window and the content slice examined in one iteration.
 *
 * The slice starts at `cursorPos` and ends at the smaller of two limits, each padded by 4000
 * characters and capped at the content length: the boundary position after the window's last
 * page, and (when `maxContentLength` is truthy) `cursorPos + maxContentLength`. The slice end is
 * always at least `cursorPos + 1`.
 *
 * @returns `{ remainingContent, sliceEnd, windowEndIdx }`
 */
const computeIterationWindow = (fullContent, cursorPos, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, maxPages, maxContentLength) => {
	const padding = 4e3;
	const totalLength = fullContent.length;
	const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
	const windowEndAbsPos = boundaryPositions[windowEndIdx - fromIdx + 1] ?? totalLength;
	const limitByPages = Math.min(totalLength, windowEndAbsPos + padding);
	let limitByLength = totalLength;
	if (maxContentLength) {
		limitByLength = Math.min(totalLength, cursorPos + maxContentLength + padding);
	}
	const sliceEnd = Math.max(cursorPos + 1, Math.min(limitByPages, limitByLength));
	return {
		remainingContent: fullContent.slice(cursorPos, sliceEnd),
		sliceEnd,
		windowEndIdx
	};
};
3014
- const computeWindowEndPositionForIteration = (remainingContent, cursorPos, currentFromIdx, fromIdx, windowEndIdx, toIdx, pageIds, boundaryPositions, normalizedPages, cumulativeOffsets, maxPages, maxContentLength, logger) => {
3015
- if (maxPages === 0) {
3016
- const nextPageStartPos = boundaryPositions[currentFromIdx - fromIdx + 1] ?? Number.POSITIVE_INFINITY;
3017
- const remainingInCurrentPage = Math.max(0, nextPageStartPos - cursorPos);
3018
- return Math.min(maxContentLength ? Math.min(remainingInCurrentPage, maxContentLength) : remainingInCurrentPage, remainingContent.length);
3019
- }
3020
- const pos = getWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger);
3021
- return Math.min(pos, remainingContent.length);
3022
- };
3023
- const ensureProgressingBreakOffset = (foundBreakOffset, remainingContent, cursorPos, maxContentLength, logger) => {
3024
- if (foundBreakOffset > 0) return foundBreakOffset;
3025
- const fallbackPos = maxContentLength ? Math.min(maxContentLength, remainingContent.length) : 1;
3026
- const breakOffset = Math.max(1, fallbackPos);
3027
- logger?.warn?.("[breakpoints] No progress from findBreakOffsetForWindow; forcing forward movement", {
3028
- breakOffset,
3029
- cursorPos
3030
- });
3031
- return breakOffset;
3032
- };
3033
- const updateLastBreakpointFromFound = (found, lastBreakpoint) => {
3034
- if (found.breakpointIndex !== void 0 && found.breakpointRule) return {
3035
- breakpointIndex: found.breakpointIndex,
3036
- rule: found.breakpointRule,
3037
- wordIndex: found.wordIndex
3038
- };
3039
- return lastBreakpoint;
3040
- };
3041
- const appendPieceAndAdvance = (fullContent, cursorPos, breakPos, pieceContent, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, normalizedPages, maxPages, isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, result, logger, contentLengthSplit) => {
3042
- let { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
3043
- if (actualStartIdx < currentFromIdx) {
3044
- logger?.warn?.("[breakpoints] Page attribution drift detected; clamping actualStartIdx", {
3045
- actualStartIdx,
3046
- currentFromIdx
3047
- });
3048
- actualStartIdx = currentFromIdx;
3049
- }
3050
- if (maxPages === 0) {
3051
- actualEndIdx = Math.min(actualEndIdx, currentFromIdx);
3052
- actualStartIdx = Math.min(actualStartIdx, currentFromIdx);
3053
- } else if (maxPages > 0) {
3054
- const maxAllowedEndIdx = computeWindowEndIdx(actualStartIdx, toIdx, pageIds, maxPages);
3055
- actualEndIdx = Math.min(actualEndIdx, maxAllowedEndIdx);
3056
- }
3057
- const meta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, contentLengthSplit);
3058
- const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, true);
3059
- if (pieceSeg) result.push(pieceSeg);
3060
- const next = advanceCursorAndIndex(fullContent, breakPos, actualEndIdx, toIdx, pageIds, normalizedPages);
3061
- let nextFromIdx = next.currentFromIdx;
3062
- if (maxPages === 0) nextFromIdx = findPageIndexForPosition(next.cursorPos, boundaryPositions, fromIdx);
3063
- return {
3064
- currentFromIdx: nextFromIdx,
3065
- cursorPos: next.cursorPos
3066
- };
3067
- };
3068
- const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, logger, debugMetaKey, maxContentLength) => {
3069
- const fullContent = segment.content;
3070
- const pageCount = toIdx - fromIdx + 1;
3071
- const isAligned = checkFastPathAlignment(cumulativeOffsets, fullContent, fromIdx, toIdx, pageCount, logger);
3072
- const isPageBoundaryOnly = expandedBreakpoints.every((bp) => bp.regex === null && bp.excludeSet.size === 0 && bp.skipWhenRegex === null);
3073
- if (pageCount < 1e3 || !isAligned || !isPageBoundaryOnly || maxContentLength || debugMetaKey) return null;
3074
- if (maxPages === 0) return processTrivialFastPath(fromIdx, toIdx, pageIds, normalizedPages, pageCount, segment.meta, debugMetaKey, logger);
3075
- return processOffsetFastPath(fullContent, fromIdx, toIdx, pageIds, cumulativeOffsets, maxPages, segment.meta, debugMetaKey, logger);
3076
- };
3077
- /**
3078
- * For maxPages=0 with maxContentLength: if current page's remaining content fits,
3079
- * create a segment and advance to next page without applying breakpoints.
3080
- */
3081
- const tryHandleCurrentPageFit = (fullContent, cursorPos, currentFromIdx, fromIdx, actualRemainingEndIdx, boundaryPositions, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segmentMeta, lastBreakpoint, result) => {
3082
- if (maxPages !== 0 || !maxContentLength || currentFromIdx >= actualRemainingEndIdx) return { handled: false };
3083
- const currentPageEndPos = boundaryPositions[currentFromIdx - fromIdx + 1] ?? fullContent.length;
3084
- const currentPageRemainingContent = fullContent.slice(cursorPos, currentPageEndPos).trim();
3085
- if (!currentPageRemainingContent) return { handled: false };
3086
- const currentPageFitsInLength = currentPageRemainingContent.length <= maxContentLength;
3087
- const currentPageHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, currentFromIdx);
3088
- if (!currentPageFitsInLength || currentPageHasExclusions) return { handled: false };
3089
- const pageBoundaryIdx = expandedBreakpoints.findIndex((bp) => bp.regex === null);
3090
- const pageBoundaryBreakpoint = pageBoundaryIdx >= 0 ? {
3091
- breakpointIndex: pageBoundaryIdx,
3092
- rule: { pattern: "" }
3093
- } : lastBreakpoint;
3094
- const includeMeta = isFirstPiece || Boolean(debugMetaKey);
3095
- const meta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, segmentMeta, pageBoundaryBreakpoint);
3096
- const seg = createSegment(currentPageRemainingContent, pageIds[currentFromIdx], void 0, includeMeta ? meta : void 0);
3097
- if (seg) result.push(seg);
3098
- let newCursorPos = currentPageEndPos;
3099
- while (newCursorPos < fullContent.length && /\s/.test(fullContent[newCursorPos])) newCursorPos++;
3100
- return {
3101
- handled: true,
3102
- newCursorPos,
3103
- newFromIdx: currentFromIdx + 1,
3104
- newLastBreakpoint: pageBoundaryBreakpoint
3105
- };
3106
- };
3107
- const processOversizedSegmentIterative = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength) => {
3108
- const result = [];
3109
- const fullContent = segment.content;
3110
- const pageCount = toIdx - fromIdx + 1;
3111
- logger?.debug?.("[breakpoints] processOversizedSegment: Using iterative path", {
3112
- contentLength: fullContent.length,
3113
- fromIdx,
3114
- maxContentLength,
3115
- maxPages,
3116
- pageCount,
3117
- toIdx
3118
- });
3119
- let cursorPos = 0;
3120
- let currentFromIdx = fromIdx;
3121
- let isFirstPiece = true;
3122
- let lastBreakpoint = null;
3123
- const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
3124
- logger?.debug?.("[breakpoints] boundaryPositions built", {
3125
- boundaryPositions,
3126
- fromIdx,
3127
- fullContentLength: fullContent.length,
3128
- toIdx
3129
- });
3130
- const MAX_SAFE_ITERATIONS = 1e5;
3131
- let didHitMaxIterations = true;
3132
- for (let i = 1; i <= MAX_SAFE_ITERATIONS; i++) {
3133
- if (cursorPos >= fullContent.length || currentFromIdx > toIdx) {
3134
- didHitMaxIterations = false;
3135
- break;
3136
- }
3137
- const { remainingContent, windowEndIdx } = computeIterationWindow(fullContent, cursorPos, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, maxPages, maxContentLength);
3138
- if (!remainingContent.trim()) {
3139
- didHitMaxIterations = false;
3140
- break;
3141
- }
3142
- const actualRemainingContent = fullContent.slice(cursorPos);
3143
- const actualEndPos = Math.max(cursorPos, fullContent.length - 1);
3144
- const actualRemainingEndIdx = Math.min(findPageIndexForPosition(actualEndPos, boundaryPositions, fromIdx), toIdx);
3145
- const currentPageFit = tryHandleCurrentPageFit(fullContent, cursorPos, currentFromIdx, fromIdx, actualRemainingEndIdx, boundaryPositions, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result);
3146
- if (currentPageFit.handled) {
3147
- cursorPos = currentPageFit.newCursorPos;
3148
- currentFromIdx = currentPageFit.newFromIdx;
3149
- lastBreakpoint = currentPageFit.newLastBreakpoint;
3150
- isFirstPiece = false;
3151
- continue;
3152
- }
3153
- if (handleOversizedSegmentFit(actualRemainingContent, currentFromIdx, actualRemainingEndIdx, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result)) {
3154
- didHitMaxIterations = false;
3155
- break;
3156
- }
3157
- const windowEndPosition = computeWindowEndPositionForIteration(remainingContent, cursorPos, currentFromIdx, fromIdx, windowEndIdx, toIdx, pageIds, boundaryPositions, normalizedPages, cumulativeOffsets, maxPages, maxContentLength, logger);
3158
- logger?.trace?.(`[breakpoints] iteration=${i}`, {
3159
- currentFromIdx,
3160
- cursorPos,
3161
- windowEndIdx,
3162
- windowEndPosition
3163
- });
3164
- const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer, maxContentLength);
3165
- const breakOffset = ensureProgressingBreakOffset(found.breakOffset, remainingContent, cursorPos, maxContentLength, logger);
3166
- lastBreakpoint = updateLastBreakpointFromFound(found, lastBreakpoint);
3167
- const breakPos = cursorPos + breakOffset;
3168
- const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
3169
- if (!pieceContent) {
3170
- cursorPos = breakPos;
3171
- isFirstPiece = false;
3172
- continue;
3173
- }
3174
- const next = appendPieceAndAdvance(fullContent, cursorPos, breakPos, pieceContent, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, normalizedPages, maxPages, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result, logger, found.contentLengthSplit);
3175
- cursorPos = next.cursorPos;
3176
- currentFromIdx = next.currentFromIdx;
3177
- isFirstPiece = false;
3178
- }
3179
- if (didHitMaxIterations) logger?.error?.("[breakpoints] Stopped processing oversized segment: reached MAX_SAFE_ITERATIONS", {
3180
- cursorPos,
3181
- fullContentLength: fullContent.length,
3182
- iterations: MAX_SAFE_ITERATIONS
3183
- });
3184
- logger?.debug?.("[breakpoints] processOversizedSegment: Complete", { resultCount: result.length });
3185
- return result;
3186
- };
3187
- /**
3188
- * Applies breakpoints to oversized segments.
3189
- *
3190
- * Note: This is an internal engine used by `segmentPages()`.
3191
- */
3192
- /**
3193
- * Processes an oversized segment by iterating through the content and
3194
- * breaking it into smaller pieces that fit within maxPages constraints.
3195
- *
3196
- * Uses precomputed boundary positions for O(log n) page attribution lookups.
3197
- */
3198
- const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength) => {
3199
- const fast = tryProcessOversizedSegmentFastPath(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, logger, debugMetaKey, maxContentLength);
3200
- if (fast) return fast;
3201
- return processOversizedSegmentIterative(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength);
3202
- };
3203
- const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey, maxContentLength, rawPatternProcessor) => {
3204
- const pageIds = pages.map((p) => p.id);
3205
- const pageIdToIndex = buildPageIdToIndexMap(pageIds);
3206
- const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
3207
- const cumulativeOffsets = buildCumulativeOffsets(pageIds, normalizedPages);
3208
- const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor, rawPatternProcessor);
3209
- const result = [];
3210
- logger?.info?.("Starting breakpoint processing", {
3211
- maxPages,
3212
- segmentCount: segments.length
3213
- });
3214
- logger?.debug?.("[breakpoints] inputSegments", {
3215
- segmentCount: segments.length,
3216
- segments: segments.map((s) => ({
3217
- contentLength: s.content.length,
3218
- from: s.from,
3219
- to: s.to
3220
- }))
3221
- });
3222
- for (const segment of segments) {
3223
- const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
3224
- const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
3225
- const segmentSpan = (segment.to ?? segment.from) - segment.from;
3226
- const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
3227
- const fitsInPages = segmentSpan <= maxPages;
3228
- const fitsInLength = !maxContentLength || segment.content.length <= maxContentLength;
3229
- if (fitsInPages && fitsInLength && !hasExclusions) {
3230
- result.push(segment);
3231
- continue;
3232
- }
3233
- logger?.debug?.("[breakpoints] Processing oversized segment", {
3234
- contentLength: segment.content.length,
3235
- from: segment.from,
3236
- hasExclusions,
3237
- pageSpan: toIdx - fromIdx + 1,
3238
- reasonFitsInLength: fitsInLength,
3239
- reasonFitsInPages: fitsInPages,
3240
- to: segment.to
3241
- });
3242
- const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength);
3243
- result.push(...broken.map((s) => {
3244
- const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
3245
- const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
3246
- if (segFromIdx >= 0 && segToIdx > segFromIdx) return {
3247
- ...s,
3248
- content: applyPageJoinerBetweenPages(s.content, segFromIdx, segToIdx, pageIds, normalizedPages, pageJoiner)
3249
- };
3250
- return s;
3251
- }));
3252
- }
3253
- logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
3254
- return result;
3255
- };
3256
- //#endregion
3257
- //#region src/segmentation/rule-regex.ts
3258
- /**
3259
- * Checks if a regex pattern contains standard (anonymous) capturing groups.
3260
- *
3261
- * Detects standard capturing groups `(...)` while excluding:
3262
- * - Non-capturing groups `(?:...)`
3263
- * - Lookahead assertions `(?=...)` and `(?!...)`
3264
- * - Lookbehind assertions `(?<=...)` and `(?<!...)`
3265
- * - Named groups `(?<name>...)` (start with `(?` so excluded here)
3266
- *
3267
- * NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
3268
- */
3269
- const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
3270
- /**
3271
- * Extracts named capture group names from a regex pattern.
3272
- *
3273
- * Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
3274
- *
3275
- * @example
3276
- * extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
3277
- * extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
3278
- * extractNamedCaptureNames('^\\d+') // []
3279
- */
3280
- const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([A-Za-z_]\w*)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
3281
- /**
3282
- * Safely compiles a regex pattern, throwing a helpful error if invalid.
3283
- */
3284
- const compileRuleRegex = (pattern) => {
3285
- try {
3286
- return new RegExp(pattern, "gmu");
3287
- } catch (error) {
3288
- throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${error instanceof Error ? error.message : String(error)}`);
3289
- }
3290
- };
3291
- /**
3292
- * Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
3293
- *
3294
- * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
3295
- */
3296
- const processPattern = (pattern, fuzzy, capturePrefix) => {
3297
- const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
3298
- return {
3299
- captureNames,
3300
- pattern: expanded
3301
- };
3302
- };
3303
- /**
3304
- * Processes a breakpoint pattern by expanding tokens only.
3305
- *
3306
- * Unlike `processPattern`, this does NOT escape brackets because breakpoints
3307
- * are treated as raw regex patterns (like the `regex` rule type).
3308
- * Users have full control over regex syntax including `(?:...)` groups.
3309
- */
3310
- const processBreakpointPattern = (pattern) => {
3311
- const { pattern: expanded } = expandTokensWithCaptures(pattern);
3312
- return expanded;
3313
- };
3314
- /**
3315
- * Builds the raw regex source for a `lineStartsAfter` rule.
3316
- *
3317
- * Expands each pattern through `processPattern()`, combines them into an
3318
- * alternation at the start of a line, and appends a trailing content capture.
3319
- *
3320
- * @param patterns - Template-like line-start markers to match
3321
- * @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
3322
- * @param capturePrefix - Optional prefix used for internal named captures
3323
- * @returns Regex source plus the named captures extracted from the patterns
3324
- */
3325
- const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
3326
- const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
3327
- const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
3328
- return {
3329
- captureNames: processed.flatMap((p) => p.captureNames),
3330
- regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
3331
- };
3332
- };
3333
- /**
3334
- * Builds the raw regex source for a `lineStartsWith` rule.
3335
- *
3336
- * Expands each pattern through `processPattern()` and combines them into an
3337
- * alternation anchored at the start of a line.
3338
- *
3339
- * @param patterns - Template-like line-start markers to match
3340
- * @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
3341
- * @param capturePrefix - Optional prefix used for internal named captures
3342
- * @returns Regex source plus the named captures extracted from the patterns
3343
- */
3344
- const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
3345
- const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
3346
- const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
3347
- return {
3348
- captureNames: processed.flatMap((p) => p.captureNames),
3349
- regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
3350
- };
3351
- };
3352
- /**
3353
- * Builds the raw regex source for a `lineEndsWith` rule.
3354
- *
3355
- * Expands each pattern through `processPattern()` and combines them into an
3356
- * end-anchored alternation.
3357
- *
3358
- * @param patterns - Template-like line-end markers to match
3359
- * @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
3360
- * @param capturePrefix - Optional prefix used for internal named captures
3361
- * @returns Regex source plus the named captures extracted from the patterns
3362
- */
3363
- const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
3364
- const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
3365
- const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
3366
- return {
3367
- captureNames: processed.flatMap((p) => p.captureNames),
3368
- regex: `(?:${alternatives})$`
3369
- };
3370
- };
3371
- /**
3372
- * Builds the raw regex source for a `template` rule.
3373
- *
3374
- * Expands tokens and named captures via `expandTokensWithCaptures()` after
3375
- * applying `escapeTemplateBrackets()` to non-token brackets.
3376
- *
3377
- * @param template - Template string containing optional `{{token}}` markers
3378
- * @param capturePrefix - Optional prefix used for internal named captures
3379
- * @returns Regex source plus the named captures extracted from the template
3380
- */
3381
- const buildTemplateRegexSource = (template, capturePrefix) => {
3382
- const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
3383
- return {
3384
- captureNames,
3385
- regex: pattern
3386
- };
3387
- };
3388
- const getFuzzyCandidatePatterns = (rule) => [
3389
- ..."lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) ? rule.lineStartsWith : [],
3390
- ..."lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) ? rule.lineStartsAfter : [],
3391
- ..."lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) ? rule.lineEndsWith : []
3392
- ];
3393
- const buildLineBasedRuleRegex = (rule, fuzzy, capturePrefix) => {
3394
- if ("lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) && rule.lineStartsWith.length > 0) return buildLineStartsWithRegexSource(rule.lineStartsWith, fuzzy, capturePrefix);
3395
- if ("lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) && rule.lineEndsWith.length > 0) return buildLineEndsWithRegexSource(rule.lineEndsWith, fuzzy, capturePrefix);
3396
- if ("template" in rule && typeof rule.template === "string") return buildTemplateRegexSource(rule.template, capturePrefix);
3397
- if ("dictionaryEntry" in rule && rule.dictionaryEntry) return buildArabicDictionaryEntryRegexSource(rule.dictionaryEntry, capturePrefix);
3398
- return null;
3399
- };
3400
- /**
3401
- * Builds a compiled regex and metadata from a split rule.
3402
- *
3403
- * Behavior mirrors the previous implementation in `segmenter.ts`.
3404
- */
3405
- const buildRuleRegex = (rule, capturePrefix) => {
3406
- const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy(getFuzzyCandidatePatterns(rule));
3407
- if ("lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) && rule.lineStartsAfter.length > 0) {
3408
- const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(rule.lineStartsAfter, fuzzy, capturePrefix);
3409
- return {
3410
- captureNames,
3411
- regex: compileRuleRegex(lsaRegex),
3412
- usesCapture: true,
3413
- usesLineStartsAfter: true
3414
- };
3415
- }
3416
- const ruleRegexSource = buildLineBasedRuleRegex(rule, fuzzy, capturePrefix);
3417
- let finalRegex = ruleRegexSource?.regex;
3418
- let allCaptureNames = ruleRegexSource?.captureNames ?? [];
3419
- if (!finalRegex && "regex" in rule && typeof rule.regex === "string") finalRegex = rule.regex;
3420
- if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, lineEndsWith, or dictionaryEntry");
3421
- if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
3422
- return {
3423
- captureNames: allCaptureNames,
3424
- regex: compileRuleRegex(finalRegex),
3425
- usesCapture: hasCapturingGroup(finalRegex),
3426
- usesLineStartsAfter: false
3427
- };
3428
- };
3429
- //#endregion
3430
- //#region src/segmentation/fast-fuzzy-prefix.ts
3431
- /**
3432
- * Fast-path fuzzy prefix matching for common Arabic line-start markers.
3433
- *
3434
- * This exists to avoid running expensive fuzzy-expanded regex alternations over
3435
- * a giant concatenated string. Instead, we match only at known line-start
3436
- * offsets and perform a small deterministic comparison:
3437
- * - Skip Arabic diacritics in the CONTENT
3438
- * - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
3439
- *
3440
- * This module is intentionally conservative: it only supports "literal"
3441
- * token patterns (plain text alternation via `|`), not general regex.
3442
- */
3443
- const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
3444
- const equivKey = (ch) => {
3445
- switch (ch) {
3446
- case "آ":
3447
- case "أ":
3448
- case "إ": return "ا";
3449
- case "ه": return "ة";
3450
- case "ي": return "ى";
3451
- default: return ch;
3452
- }
3453
- };
3454
- const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
3455
- let i = offset;
3456
- while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
3457
- for (let j = 0; j < literal.length; j++) {
3458
- const litCh = literal[j];
3459
- while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
3460
- if (i >= content.length || equivKey(content[i]) !== equivKey(litCh)) return null;
3461
- i++;
3462
- }
3463
- while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
3464
- return i;
3465
- };
3466
- const isLiteralOnly = (s) => !/[\\[\]{}()^$.*+?]/.test(s);
3467
- const compileLiteralAlternation = (pattern) => {
3468
- if (!pattern || !isLiteralOnly(pattern)) return null;
3469
- const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
3470
- return alternatives.length ? { alternatives } : null;
3471
- };
3472
- const compileFastFuzzyTokenRule = (tokenTemplate) => {
3473
- const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
3474
- if (!m) return null;
3475
- const token = m[1];
3476
- if (!(token in TOKEN_PATTERNS)) return null;
3477
- const compiled = compileLiteralAlternation(getTokenPattern(token));
3478
- return compiled ? {
3479
- alternatives: compiled.alternatives,
3480
- token
3481
- } : null;
3482
- };
3483
- const matchFastFuzzyTokenAt = (content, offset, compiled) => {
3484
- for (const alt of compiled.alternatives) {
3485
- const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
3486
- if (end !== null) return end;
3487
- }
3488
- return null;
3489
- };
3490
- //#endregion
3491
- //#region src/segmentation/segmenter-rule-utils.ts
3492
- const tryCompileFastFuzzyRule = (rule) => {
3493
- const fuzzyCandidatePatterns = [..."lineStartsWith" in rule ? rule.lineStartsWith : [], ..."lineStartsAfter" in rule ? rule.lineStartsAfter : []];
3494
- if (!(rule.fuzzy ?? shouldDefaultToFuzzy(fuzzyCandidatePatterns))) return null;
3495
- if ("lineStartsWith" in rule && rule.lineStartsWith?.length === 1) {
3496
- const compiled = compileFastFuzzyTokenRule(rule.lineStartsWith[0]);
3497
- if (compiled) return {
3498
- compiled,
3499
- kind: "startsWith"
3500
- };
3501
- }
3502
- if ("lineStartsAfter" in rule && rule.lineStartsAfter?.length === 1) {
3503
- const compiled = compileFastFuzzyTokenRule(rule.lineStartsAfter[0]);
3504
- if (compiled) return {
3505
- compiled,
3506
- kind: "startsAfter"
3507
- };
3508
- }
3509
- return null;
3510
- };
3511
- const isCombinableRule = (rule) => {
3512
- if ("regex" in rule && rule.regex) return extractNamedCaptureNames(rule.regex).length === 0 && !/\\[1-9]/.test(rule.regex) && !hasCapturingGroup(rule.regex);
3513
- return true;
3514
- };
3515
- const partitionRulesForMatching = (rules) => {
3516
- const combinableRules = [];
3517
- const standaloneRules = [];
3518
- const fastFuzzyRules = [];
3519
- for (let index = 0; index < rules.length; index++) {
3520
- const rule = rules[index];
3521
- const fuzzyComp = tryCompileFastFuzzyRule(rule);
3522
- if (fuzzyComp) {
3523
- fastFuzzyRules.push({
3524
- compiled: fuzzyComp.compiled,
3525
- kind: fuzzyComp.kind,
3526
- rule,
3527
- ruleIndex: index
3528
- });
3529
- continue;
3530
- }
3531
- if (isCombinableRule(rule)) combinableRules.push({
3532
- index,
3533
- prefix: `r${index}_`,
3534
- rule
3535
- });
3536
- else standaloneRules.push({
3537
- index,
3538
- rule
3539
- });
3540
- }
3541
- return {
3542
- combinableRules,
3543
- fastFuzzyRules,
3544
- standaloneRules
3545
- };
3546
- };
3547
- const STRONG_SENTENCE_TERMINATORS = /[.!?؟؛۔…]$/u;
3548
- const TRAILING_PAGE_WRAP_NOISE = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>]+$/u;
3549
- const TRAILING_WORD_DELIMITERS = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>.,!?؟؛،:]+$/u;
3550
- const ARABIC_WORD_REGEX = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
3551
- const trimTrailingPageWrapNoise = (text) => {
3552
- let trimmed = text.trimEnd();
3553
- while (trimmed !== trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "")) trimmed = trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "");
3554
- return trimmed;
3555
- };
3556
- const endsWithStrongSentenceTerminator = (pageContent) => {
3557
- return STRONG_SENTENCE_TERMINATORS.test(trimTrailingPageWrapNoise(pageContent));
3558
- };
3559
- const extractLastArabicWord = (pageContent) => {
3560
- return [...trimTrailingPageWrapNoise(pageContent).replace(TRAILING_WORD_DELIMITERS, "").matchAll(ARABIC_WORD_REGEX)].at(-1)?.[0] ?? "";
3561
- };
3562
- const shouldAllowPageStartMatch = (previousPageContent, prevWordStoplist) => {
3563
- if (!prevWordStoplist || endsWithStrongSentenceTerminator(previousPageContent)) return true;
3564
- const lastWord = extractLastArabicWord(previousPageContent);
3565
- return !lastWord || !prevWordStoplist.has(normalizeArabicForComparison(lastWord));
3566
- };
3567
- const shouldAllowSamePageMatch = (contentBeforeMatch, stoplist) => {
3568
- if (!stoplist) return true;
3569
- const lastWord = extractLastArabicWord(contentBeforeMatch);
3570
- return !lastWord || !stoplist.has(normalizeArabicForComparison(lastWord));
3571
- };
3572
- const createPageStartGuardChecker = (matchContent, pageMap) => {
3573
- const pageStartToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.start, i]));
3574
- const compiledPageStartPrev = /* @__PURE__ */ new Map();
3575
- const compiledPrevWordStoplists = /* @__PURE__ */ new Map();
3576
- const compiledSamePagePrevWordStoplists = /* @__PURE__ */ new Map();
3577
- const pageIdToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.id, i]));
3578
- const getPageStartPrevRegex = (rule, ruleIndex) => {
3579
- if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
3580
- const pattern = rule.pageStartGuard;
3581
- if (!pattern) {
3582
- compiledPageStartPrev.set(ruleIndex, null);
3583
- return null;
3584
- }
3585
- const re = new RegExp(`(?:${processPattern(pattern, false).pattern})$`, "u");
3586
- compiledPageStartPrev.set(ruleIndex, re);
3587
- return re;
3588
- };
3589
- const getPrevWordStoplist = (rule, ruleIndex) => {
3590
- if (compiledPrevWordStoplists.has(ruleIndex)) return compiledPrevWordStoplists.get(ruleIndex) ?? null;
3591
- const stoplist = rule.pageStartPrevWordStoplist;
3592
- if (!stoplist?.length) {
3593
- compiledPrevWordStoplists.set(ruleIndex, null);
3594
- return null;
3595
- }
3596
- const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
3597
- compiledPrevWordStoplists.set(ruleIndex, normalized);
3598
- return normalized;
3599
- };
3600
- const getSamePagePrevWordStoplist = (rule, ruleIndex) => {
3601
- if (compiledSamePagePrevWordStoplists.has(ruleIndex)) return compiledSamePagePrevWordStoplists.get(ruleIndex) ?? null;
3602
- const stoplist = rule.samePagePrevWordStoplist;
3603
- if (!stoplist?.length) {
3604
- compiledSamePagePrevWordStoplists.set(ruleIndex, null);
3605
- return null;
3606
- }
3607
- const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
3608
- compiledSamePagePrevWordStoplists.set(ruleIndex, normalized);
3609
- return normalized;
3610
- };
3611
- const getPreviousPageContent = (boundaryIndex) => {
3612
- if (boundaryIndex <= 0) return "";
3613
- const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
3614
- return matchContent.slice(prevBoundary.start, prevBoundary.end);
3615
- };
3616
- const getPrevPageLastNonWsChar = (boundaryIndex) => {
3617
- if (boundaryIndex <= 0) return "";
3618
- const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
3619
- for (let i = prevBoundary.end - 1; i >= prevBoundary.start; i--) {
3620
- const ch = matchContent[i];
3621
- if (ch && !/\s/u.test(ch)) return ch;
3622
- }
3623
- return "";
3624
- };
3625
- const getCurrentPageContentBeforeMatch = (matchStart) => {
3626
- const pageId = pageMap.getId(matchStart);
3627
- const boundaryIndex = pageIdToBoundaryIndex.get(pageId);
3628
- if (boundaryIndex === void 0) return "";
3629
- const boundary = pageMap.boundaries[boundaryIndex];
3630
- return matchContent.slice(boundary.start, matchStart);
3631
- };
3632
- return (rule, ruleIndex, matchStart) => {
3633
- const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
3634
- if (boundaryIndex !== void 0 && boundaryIndex !== 0) {
3635
- const prevReq = getPageStartPrevRegex(rule, ruleIndex);
3636
- if (prevReq) {
3637
- const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
3638
- if (!lastChar || !prevReq.test(lastChar)) return false;
3639
- }
3640
- return shouldAllowPageStartMatch(getPreviousPageContent(boundaryIndex), getPrevWordStoplist(rule, ruleIndex));
3641
- }
3642
- return shouldAllowSamePageMatch(getCurrentPageContentBeforeMatch(matchStart), getSamePagePrevWordStoplist(rule, ruleIndex));
3643
- };
3644
- };
3645
/**
 * Tells whether a page id satisfies a rule's optional `min`, `max` and
 * `exclude` constraints. A missing `min`/`max` places no bound on that side.
 *
 * @param rule - Rule carrying the optional `min`, `max` and `exclude` fields
 * @param pageId - Page id to test
 * @returns `true` when the page is inside the range and not excluded
 */
const passesRuleConstraints$1 = (rule, pageId) => {
	if (rule.min !== void 0 && !(pageId >= rule.min)) return false;
	if (rule.max !== void 0 && !(pageId <= rule.max)) return false;
	return !isPageExcluded(pageId, rule.exclude);
};
3649
/**
 * Appends a split point to the list kept for `ruleIndex`, creating that list
 * on first use. Mutates `splitPointsByRule` in place.
 *
 * @param splitPointsByRule - Map from rule index to its collected split points
 * @param ruleIndex - Key under which the point is stored
 * @param sp - Split point to store
 */
const recordSplitPointAt = (splitPointsByRule, ruleIndex, sp) => {
	const bucket = splitPointsByRule.get(ruleIndex);
	if (bucket) {
		bucket.push(sp);
		return;
	}
	splitPointsByRule.set(ruleIndex, [sp]);
};
3657
/**
 * Tries one fast-fuzzy rule at `lineStart` and, on success, records a split
 * point for it.
 *
 * The split index is `lineStart` when the rule splits "at" (the default) and
 * the match end otherwise. For kinds other than "startsWith" the point also
 * carries `contentStartOffset`: the matched marker length when splitting
 * "at", otherwise `undefined`.
 *
 * @param matchContent - Content being scanned
 * @param lineStart - Offset of the line start to test
 * @param ffRule - Fast-fuzzy rule entry (`compiled`, `kind`, `rule`, `ruleIndex`)
 * @param splitPointsByRule - Mutable map collecting split points by rule index
 */
const attemptFastFuzzyMatch = (matchContent, lineStart, { compiled, kind, rule, ruleIndex }, splitPointsByRule) => {
	const matchEnd = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
	if (matchEnd === null) return;
	const splitsAtMarker = (rule.split ?? "at") === "at";
	const index = splitsAtMarker ? lineStart : matchEnd;
	const point = kind === "startsWith" ? {
		index,
		meta: rule.meta
	} : {
		contentStartOffset: splitsAtMarker ? matchEnd - lineStart : void 0,
		index,
		meta: rule.meta
	};
	recordSplitPointAt(splitPointsByRule, ruleIndex, point);
};
3675
/**
 * Runs every fast-fuzzy rule against a single line start.
 *
 * A rule is attempted only when the page passes its min/max/exclude
 * constraints and the page-start guard accepts the position; the guard is
 * not consulted when the constraints already fail.
 *
 * @param matchContent - Content being scanned
 * @param lineStart - Offset of the line start to test
 * @param pageId - Id of the page containing `lineStart`
 * @param fastFuzzyRules - Fast-fuzzy rule entries to try
 * @param passesPageStartGuard - Callback deciding whether a match is allowed
 * @param splitPointsByRule - Mutable map collecting split points by rule index
 */
const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule) => {
	for (const candidate of fastFuzzyRules) {
		const eligible = passesRuleConstraints$1(candidate.rule, pageId) && passesPageStartGuard(candidate.rule, candidate.ruleIndex, lineStart);
		if (eligible) attemptFastFuzzyMatch(matchContent, lineStart, candidate, splitPointsByRule);
	}
};
3685
/**
 * Walks `matchContent` line by line and tries the fast-fuzzy rules at every
 * line start, tracking which page boundary the current line belongs to.
 *
 * @param matchContent - Concatenated content being segmented
 * @param pageMap - Page mapping; only `boundaries` is read here
 * @param fastFuzzyRules - Fast-fuzzy rule entries
 * @param passesPageStartGuard - Callback deciding whether a match is allowed
 * @returns Map from rule index to the split points found (empty when there
 *          are no rules or no boundaries)
 */
const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, passesPageStartGuard) => {
	const splitPointsByRule = /* @__PURE__ */ new Map();
	if (fastFuzzyRules.length === 0 || pageMap.boundaries.length === 0) return splitPointsByRule;
	const { boundaries } = pageMap;
	const lastBoundaryIdx = boundaries.length - 1;
	let boundaryIdx = 0;
	let current = boundaries[boundaryIdx];
	let lineStart = 0;
	while (lineStart < matchContent.length) {
		// Move forward to the boundary containing this line; never step past the last one.
		while (current && lineStart > current.end && boundaryIdx < lastBoundaryIdx) {
			boundaryIdx += 1;
			current = boundaries[boundaryIdx];
		}
		const pageId = current?.id ?? 0;
		processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule);
		const newlineAt = matchContent.indexOf("\n", lineStart);
		if (newlineAt === -1) break;
		lineStart = newlineAt + 1;
	}
	return splitPointsByRule;
};
3707
- //#endregion
3708
- //#region src/segmentation/split-point-helpers.ts
3709
- const MAX_REGEX_ITERATIONS = 1e5;
3710
/**
 * Collects the named captures belonging to one rule, stripping the rule's
 * prefix from each capture name.
 *
 * @param groups - `groups` object of a regex match (may be undefined)
 * @param captureNames - Prefixed capture names to look up
 * @param prefix - Prefix whose length is removed from the front of each name
 * @returns Object keyed by un-prefixed name; groups that are `undefined` are omitted
 */
const extractNamedCapturesForRule = (groups, captureNames, prefix) => {
	const captures = {};
	if (groups) {
		for (const prefixedName of captureNames) {
			if (groups[prefixedName] === void 0) continue;
			captures[prefixedName.slice(prefix.length)] = groups[prefixedName];
		}
	}
	return captures;
};
3716
/**
 * Computes how far into a match the real content begins for rules flagged
 * `usesLineStartsAfter`.
 *
 * The offset is the length of the rule's own group (falling back to the whole
 * match) minus the length of its `<prefix>__content` group.
 *
 * @param match - Regex match result
 * @param ruleInfo - Rule regex metadata (`prefix`, `usesLineStartsAfter`)
 * @returns `{ contentStartOffset }`, or an empty object when the flag is off
 *          or the content group did not participate
 */
const buildContentOffsets = (match, ruleInfo) => {
	if (!ruleInfo.usesLineStartsAfter) return {};
	const { groups } = match;
	const content = groups?.[`${ruleInfo.prefix}__content`];
	if (content === void 0) return {};
	const ownBranch = groups?.[ruleInfo.prefix] ?? match[0];
	return { contentStartOffset: ownBranch.length - content.length };
};
3722
/**
 * Tells whether a page id satisfies a rule's optional `min`, `max` and
 * `exclude` constraints. A missing `min`/`max` places no bound on that side.
 *
 * @param rule - Rule carrying the optional `min`, `max` and `exclude` fields
 * @param pageId - Page id to test
 * @returns `true` when the page is inside the range and not excluded
 */
const passesRuleConstraints = (rule, pageId) => {
	if (rule.min !== void 0 && !(pageId >= rule.min)) return false;
	if (rule.max !== void 0 && !(pageId <= rule.max)) return false;
	return !isPageExcluded(pageId, rule.exclude);
};
3723
/**
 * Builds the split-point record for a match produced by the combined regex.
 *
 * @param match - Regex match result from the combined alternation
 * @param rule - Rule that owns the matched branch
 * @param ruleInfo - Regex metadata for that rule (`captureNames`, `prefix`, ...)
 * @returns Split point; `index` is the match start when the rule splits "at"
 *          (the default) and the match end otherwise. `namedCaptures` is
 *          `undefined` when no capture was extracted.
 */
const createSplitPointFromMatch = (match, rule, ruleInfo) => {
	const { groups } = match;
	const captures = extractNamedCapturesForRule(groups, ruleInfo.captureNames, ruleInfo.prefix);
	const wordIndex = extractDebugIndex(groups, "_r");
	const { contentStartOffset } = buildContentOffsets(match, ruleInfo);
	const splitsAtStart = (rule.split ?? "at") === "at";
	const hasCaptures = Object.keys(captures).length > 0;
	return {
		capturedContent: void 0,
		contentStartOffset,
		index: splitsAtStart ? match.index : match.index + match[0].length,
		meta: rule.meta,
		namedCaptures: hasCaptures ? captures : void 0,
		wordIndex
	};
};
3735
/**
 * Stores `point` in the list kept for `originalIndex`, creating the list when
 * the rule has no points yet. Mutates `splitPointsByRule` in place.
 *
 * @param splitPointsByRule - Map from rule index to its collected split points
 * @param originalIndex - Rule index used as the map key
 * @param point - Split point to store
 */
const addSplitPoint = (splitPointsByRule, originalIndex, point) => {
	const existing = splitPointsByRule.get(originalIndex);
	if (existing) existing.push(point);
	else splitPointsByRule.set(originalIndex, [point]);
};
3743
/**
 * Joins the sources of all combinable rules into one alternation, scans the
 * content with it, and records the resulting split points.
 *
 * Mutates `splitPointsByRule` in place. Throws when the rule/regex lists are
 * misaligned or when the iteration guard (`MAX_REGEX_ITERATIONS`) is exceeded.
 *
 * @param matchContent - Concatenated content being segmented
 * @param combinableRules - Rules that can be combined into a single alternation
 * @param ruleRegexes - Compiled regex metadata aligned with `combinableRules`
 * @param pageMap - Page boundary mapping utilities for the content
 * @param passesPageStartGuard - Callback that decides whether a match is allowed
 * @param splitPointsByRule - Mutable map collecting split points by rule index
 * @param logger - Optional logger for iteration diagnostics
 * @returns Nothing; results are written into `splitPointsByRule`
 */
const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
	assertCombinedRuleAlignment(combinableRules, ruleRegexes);
	const combinedSource = ruleRegexes.map((entry) => entry.source).join("|");
	const combinedRegex = new RegExp(combinedSource, "gm");
	logger?.debug?.("[segmenter] combined regex built", {
		combinableRuleCount: combinableRules.length,
		combinedSourceLength: combinedSource.length
	});
	let iterations = 0;
	for (let found = combinedRegex.exec(matchContent); found !== null; found = combinedRegex.exec(matchContent)) {
		iterations += 1;
		if (iterations > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop: exceeded ${MAX_REGEX_ITERATIONS} iterations at position ${found.index}.`);
		if (iterations % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count", {
			iterations,
			position: found.index
		});
		processCombinedMatch(combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, found);
		// A zero-length match would be returned again forever unless we step forward.
		if (found[0].length === 0) combinedRegex.lastIndex++;
	}
};
3780
/**
 * Verifies that `ruleRegexes` lines up with `combinableRules`: both lists
 * must have the same length, and each regex source must contain the named
 * group opener for the rule at the same position.
 *
 * @param combinableRules - Rules with their `prefix`
 * @param ruleRegexes - Regex metadata expected to match the rules one-to-one
 * @throws Error on a length mismatch or a missing `(?<prefix>` group
 */
const assertCombinedRuleAlignment = (combinableRules, ruleRegexes) => {
	const ruleCount = combinableRules.length;
	const regexCount = ruleRegexes.length;
	if (ruleCount !== regexCount) throw new Error(`processCombinedMatches: combinableRules/ruleRegexes length mismatch (${ruleCount} !== ${regexCount})`);
	for (const [position, { prefix }] of combinableRules.entries()) {
		const opensOwnGroup = ruleRegexes[position].source.includes(`(?<${prefix}>`);
		if (!opensOwnGroup) throw new Error(`processCombinedMatches: regex alignment mismatch for prefix "${prefix}" at index ${position}`);
	}
};
3784
/**
 * Handles one match of the combined regex: finds the first rule whose prefix
 * group participated, checks page constraints and the page-start guard, and
 * records the split point. Does nothing when no rule group matched or a
 * check fails.
 *
 * @param combinableRules - Rules combined into the alternation
 * @param ruleRegexes - Regex metadata aligned with `combinableRules`
 * @param pageMap - Page boundary mapping utilities
 * @param passesPageStartGuard - Callback that decides whether a match is allowed
 * @param splitPointsByRule - Mutable map collecting split points by rule index
 * @param match - Regex match result to process
 */
const processCombinedMatch = (combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, match) => {
	const position = combinableRules.findIndex((candidate) => match.groups?.[candidate.prefix] !== void 0);
	if (position === -1) return;
	const { rule, index: originalIndex } = combinableRules[position];
	const allowed = passesRuleConstraints(rule, pageMap.getId(match.index)) && passesPageStartGuard(rule, originalIndex, match.index);
	if (!allowed) return;
	const point = createSplitPointFromMatch(match, rule, ruleRegexes[position]);
	addSplitPoint(splitPointsByRule, originalIndex, point);
};
3791
/**
 * Compiles each combinable rule and wraps its regex source in a named group
 * `(?<prefix>...)`, so the matching branch can be identified inside a
 * combined alternation.
 *
 * @param combinableRules - Rules eligible for combined-regex processing
 * @returns Rule regex metadata aligned with the input order
 */
const buildRuleRegexes = (combinableRules) => combinableRules.map((entry) => {
	const { rule, prefix } = entry;
	const compiled = buildRuleRegex(rule, prefix);
	const source = `(?<${prefix}>${compiled.regex.source})`;
	return {
		...compiled,
		prefix,
		source
	};
});
3806
/**
 * Matches a standalone rule on its own and appends the split points it
 * yields into `splitPointsByRule`.
 *
 * Matches are filtered first by the rule's page constraints and then by the
 * page-start guard. When the rule uses `lineStartsAfter` and a capture is
 * present, the point carries `contentStartOffset` instead of
 * `capturedContent`.
 *
 * @param rule - The standalone split rule to evaluate
 * @param ruleIndex - Original rule index in the caller's rules array
 * @param matchContent - Concatenated content being segmented
 * @param pageMap - Page boundary mapping utilities for the content
 * @param passesPageStartGuard - Callback that decides whether a match is allowed
 * @param splitPointsByRule - Mutable map collecting split points by rule index
 * @returns Nothing; results are written into `splitPointsByRule`
 */
const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
	const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
	const rawMatches = findMatchesInContent(matchContent, regex, usesCapture, captureNames);
	const guarded = filterByConstraints(rawMatches, rule, pageMap.getId).filter((found) => passesPageStartGuard(rule, ruleIndex, found.start));
	const toSplitPoint = (found) => {
		const stripsMarker = usesLineStartsAfter && found.captured !== void 0;
		const splitsAtStart = (rule.split ?? "at") === "at";
		return {
			capturedContent: stripsMarker ? void 0 : found.captured,
			contentStartOffset: stripsMarker ? found.end - found.captured.length - found.start : void 0,
			index: splitsAtStart ? found.start : found.end,
			meta: rule.meta,
			namedCaptures: found.namedCaptures,
			wordIndex: found.wordIndex
		};
	};
	const points = guarded.map(toSplitPoint);
	const existing = splitPointsByRule.get(ruleIndex);
	if (existing) existing.push(...points);
	else splitPointsByRule.set(ruleIndex, points);
};
3835
/**
 * Runs `regex` repeatedly over `content` and collects a record per match.
 *
 * @param content - Text to scan
 * @param regex - Regex driven with `exec`; iteration relies on its `lastIndex`
 * @param usesCapture - Whether to report the last positional capture
 * @param captureNames - Named captures to extract for each match
 * @returns Array of `{ captured, end, namedCaptures, start, wordIndex }`
 */
const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
	const found = [];
	for (let hit = regex.exec(content); hit !== null; hit = regex.exec(content)) {
		const wordIndex = extractDebugIndex(hit.groups, "_r");
		const start = hit.index;
		found.push({
			captured: usesCapture ? getLastPositionalCapture(hit) : void 0,
			end: start + hit[0].length,
			namedCaptures: extractNamedCaptures(hit.groups, captureNames),
			start,
			wordIndex
		});
		// Step past zero-length matches so exec() cannot return the same spot again.
		if (hit[0].length === 0) regex.lastIndex++;
	}
	return found;
};
3852
/**
 * Applies per-rule occurrence filtering and optional debug metadata patches to
 * the collected split points.
 *
 * For each rule (in original order) the points recorded under its index are
 * reduced to the first one (`occurrence: "first"`), the last one
 * (`occurrence: "last"`), or kept as-is. Every emitted point is a shallow copy
 * tagged with `ruleIndex`.
 *
 * @param rules - Full rule list in original order
 * @param splitPointsByRule - Split points grouped by originating rule index
 * @param debugMetaKey - Optional metadata key used for debug provenance patches
 * @returns Flattened split points after occurrence filtering and debug merging
 */
const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
	const result = [];
	rules.forEach((rule, index) => {
		const points = splitPointsByRule.get(index);
		if (!points?.length) return;
		const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
		// Push one by one: spreading a very large array into push() passes every
		// element as a call argument and can overflow the engine's argument limit.
		for (const p of filtered) {
			const debugPatch = debugMetaKey ? buildRuleDebugPatch(index, rule, p.wordIndex) : null;
			result.push({
				...p,
				meta: debugMetaKey ? mergeDebugIntoMeta(p.meta, debugMetaKey, debugPatch) : p.meta,
				ruleIndex: index
			});
		}
	});
	return result;
};
3878
- //#endregion
3879
- //#region src/segmentation/segmenter.ts
3880
/**
 * Concatenates the pages into one string and builds the lookup structures
 * that map offsets in that string back to pages.
 *
 * Each page's content is passed through `normalizeLineEndings` and the pages
 * are joined with a single "\n". For every page a boundary `{ start, end, id }`
 * is recorded, where `end` is the offset just past the page text (the position
 * of the joining newline, when there is one). `pageBreaks` lists the offsets
 * of those joining newlines.
 *
 * @param pages - Array of input pages with id and content
 * @returns `content` (joined string), `normalizedPages` (per-page normalized
 *          text) and `pageMap` (`boundaries`, `getId`, `pageBreaks`, `pageIds`)
 *
 * @example
 * const pages = [
 *   { id: 1, content: 'Page 1 text' },
 *   { id: 2, content: 'Page 2 text' }
 * ];
 * const { content, pageMap } = buildPageMap(pages);
 * // content = 'Page 1 text\nPage 2 text'
 * // pageMap.getId(0) = 1
 * // pageMap.getId(12) = 2
 */
const buildPageMap = (pages) => {
	const boundaries = [];
	const pageBreaks = [];
	const normalizedPages = [];
	const lastPageIndex = pages.length - 1;
	let cursor = 0;
	for (let pageIndex = 0; pageIndex <= lastPageIndex; pageIndex++) {
		const text = normalizeLineEndings(pages[pageIndex].content);
		const pageEnd = cursor + text.length;
		boundaries.push({
			end: pageEnd,
			id: pages[pageIndex].id,
			start: cursor
		});
		normalizedPages.push(text);
		cursor = pageEnd;
		if (pageIndex < lastPageIndex) {
			// Account for the newline that joins this page to the next one.
			pageBreaks.push(pageEnd);
			cursor += 1;
		}
	}
	// Binary search over [start, end] ranges; offsets that hit no range fall
	// back to the last boundary.
	const locate = (off) => {
		let low = 0;
		let high = boundaries.length - 1;
		while (low <= high) {
			const middle = low + high >>> 1;
			const candidate = boundaries[middle];
			if (off < candidate.start) {
				high = middle - 1;
				continue;
			}
			if (off > candidate.end) {
				low = middle + 1;
				continue;
			}
			return candidate;
		}
		return boundaries.at(-1);
	};
	const pageMap = {
		boundaries,
		getId: (off) => locate(off)?.id ?? 0,
		pageBreaks,
		pageIds: boundaries.map((boundary) => boundary.id)
	};
	return {
		content: normalizedPages.join("\n"),
		normalizedPages,
		pageMap
	};
};
3940
/**
 * Collapses split points that share the same `index` into one and returns the
 * survivors sorted by ascending index.
 *
 * When two points collide they are combined with `mergeSplitPoints`, which
 * decides which point's fields take precedence.
 *
 * @param splitPoints - Split points, possibly with repeated indices
 * @returns One split point per distinct index, sorted by index
 */
const dedupeSplitPoints = (splitPoints) => {
	const merged = /* @__PURE__ */ new Map();
	for (const point of splitPoints) {
		const seen = merged.get(point.index);
		merged.set(point.index, seen ? mergeSplitPoints(seen, point) : point);
	}
	return Array.from(merged.values()).sort((left, right) => left.index - right.index);
};
3959
/**
 * Decides whether `incoming` should take precedence over `existing` when two
 * split points share an index.
 *
 * Precedence:
 * 1. A point with `contentStartOffset` beats one without it.
 * 2. Only when both or neither have `contentStartOffset`: a point with `meta`
 *    beats one without.
 *
 * Previously the `meta` check also fired when only `existing` had
 * `contentStartOffset`, letting an offset-less point win.
 *
 * @param existing - Split point already stored for the index
 * @param incoming - Newly encountered split point for the same index
 * @returns `true` when `incoming` should be the preferred point
 */
const prefersIncomingSplitPoint = (existing, incoming) => {
	const incomingHasOffset = incoming.contentStartOffset !== void 0;
	const existingHasOffset = existing.contentStartOffset !== void 0;
	if (incomingHasOffset !== existingHasOffset) return incomingHasOffset;
	return incoming.meta !== void 0 && existing.meta === void 0;
};
3960
/**
 * Shallow-merges two optional records; keys from `incoming` override keys
 * from `existing`.
 *
 * @param existing - First record (may be undefined)
 * @param incoming - Second record (may be undefined)
 * @returns A new merged object, or `undefined` when both inputs are falsy
 */
const mergeRecord = (existing, incoming) => {
	if (!existing && !incoming) return void 0;
	return {
		...(existing ?? {}),
		...(incoming ?? {})
	};
};
3964
/**
 * Combines two split points that share an index into one.
 *
 * The preferred point (chosen by `prefersIncomingSplitPoint`) overrides the
 * other one field by field. `contentStartOffset` falls back to the
 * non-preferred point when the preferred one lacks it, and `meta` /
 * `namedCaptures` are merged with `incoming` keys winning.
 *
 * @param existing - Split point already stored for the index
 * @param incoming - Newly encountered split point for the same index
 * @returns The merged split point
 */
const mergeSplitPoints = (existing, incoming) => {
	const incomingWins = prefersIncomingSplitPoint(existing, incoming);
	const preferred = incomingWins ? incoming : existing;
	const fallback = incomingWins ? existing : incoming;
	return {
		...fallback,
		...preferred,
		contentStartOffset: preferred.contentStartOffset ?? fallback.contentStartOffset,
		meta: mergeRecord(existing.meta, incoming.meta),
		namedCaptures: mergeRecord(existing.namedCaptures, incoming.namedCaptures)
	};
};
3975
/**
 * Guarantees there is something to hand to later processing: when no segment
 * was produced, builds one segment covering all pages.
 *
 * The pages' normalized text is joined with "\n" (`pageJoiner === "newline"`)
 * or a space, and trailing whitespace is removed. If the result is blank, or
 * there are already segments, or there are no pages, `segments` is returned
 * unchanged.
 *
 * @param segments - Segments produced so far
 * @param pages - Input pages (only the first and last `id` are read)
 * @param normalizedContent - Normalized text of each page
 * @param pageJoiner - `"newline"` or any other value for a space joiner
 * @returns `segments`, or a one-element array with the fallback segment
 */
const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
	if (segments.length > 0 || pages.length === 0) return segments;
	const separator = pageJoiner === "newline" ? "\n" : " ";
	const merged = normalizedContent.join(separator).replace(/\s+$/u, "");
	if (merged.trim() === "") return segments;
	const fromId = pages[0].id;
	const toId = pages.at(-1).id;
	const fallback = {
		content: merged,
		from: fromId
	};
	// `to` is only set when the segment really spans more than one page id.
	if (toId !== fromId) fallback.to = toId;
	return [fallback];
};
3993
/**
 * Runs every rule against the content and returns the resulting split points.
 *
 * Rules are partitioned into fast-fuzzy, combinable and standalone groups.
 * Fast-fuzzy rules run first, then the combinable rules through one combined
 * regex, then each standalone rule on its own. The grouped results finally go
 * through `applyOccurrenceFilter`.
 *
 * @param rules - Full rule list in original order
 * @param matchContent - Concatenated content being segmented
 * @param pageMap - Page boundary mapping utilities for the content
 * @param debugMetaKey - Optional metadata key for debug provenance
 * @param logger - Optional logger
 * @returns Flattened split points for all rules
 */
const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey, logger) => {
	logger?.debug?.("[segmenter] collecting split points from rules", {
		contentLength: matchContent.length,
		ruleCount: rules.length
	});
	const passesPageStartGuard = createPageStartGuardChecker(matchContent, pageMap);
	const partitioned = partitionRulesForMatching(rules);
	const { combinableRules, fastFuzzyRules, standaloneRules } = partitioned;
	logger?.debug?.("[segmenter] rules partitioned", {
		combinableCount: combinableRules.length,
		fastFuzzyCount: fastFuzzyRules.length,
		standaloneCount: standaloneRules.length
	});
	const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
	if (combinableRules.length > 0) {
		const ruleRegexes = buildRuleRegexes(combinableRules);
		processCombinedMatches(matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger);
	}
	for (const standalone of standaloneRules) {
		processStandaloneRule(standalone.rule, standalone.index, matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
	}
	return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
};
4010
/**
 * Returns the page breaks that fall inside `[startOffset, endOffset)`,
 * expressed relative to `startOffset`.
 *
 * A binary search finds the first break at or after `startOffset`; breaks are
 * then collected in order until one reaches `endOffset`.
 *
 * @param startOffset - Start of range (inclusive)
 * @param endOffset - End of range (exclusive)
 * @param sortedBreaks - Sorted array of page break offsets
 * @returns Array of break offsets relative to startOffset
 */
const findBreaksInRange = (startOffset, endOffset, sortedBreaks) => {
	const total = sortedBreaks.length;
	let first = 0;
	let limit = total;
	while (first < limit) {
		const probe = first + limit >>> 1;
		if (sortedBreaks[probe] < startOffset) first = probe + 1;
		else limit = probe;
	}
	const relative = [];
	let cursor = first;
	while (cursor < total && sortedBreaks[cursor] < endOffset) {
		relative.push(sortedBreaks[cursor] - startOffset);
		cursor += 1;
	}
	return relative;
};
4031
/**
 * Replaces the newlines that mark page breaks inside a segment with spaces.
 *
 * Only newlines whose position (relative to `startOffset`) coincides with a
 * page break are replaced; other newlines are kept. Nothing is changed when
 * the content has no newline, when `pageJoiner` is `"newline"`, or when no
 * page break lies inside the segment's range.
 *
 * @param content - Segment content string
 * @param startOffset - Starting offset of this content in concatenated string
 * @param pageBreaks - Sorted array of page break offsets
 * @param pageJoiner - How to represent page boundaries in output (`space` vs `newline`)
 * @returns Content with page-break newlines converted to spaces (or left as-is for `newline`)
 */
const convertPageBreaks = (content, startOffset, pageBreaks, pageJoiner) => {
	if (!content?.includes("\n") || pageJoiner === "newline") return content;
	const relativeBreaks = findBreaksInRange(startOffset, startOffset + content.length, pageBreaks);
	if (relativeBreaks.length === 0) return content;
	const breakPositions = new Set(relativeBreaks);
	const replaceIfPageBreak = (newline, position) => {
		if (breakPositions.has(position)) return " ";
		return newline;
	};
	return content.replace(/\n/g, replaceIfPageBreak);
};
4054
/**
 * Main entry point of the segmentation engine: turns pages into segments.
 *
 * Steps, in order: optional per-page preprocessing, concatenation into one
 * string with a page map, split-point collection from the rules,
 * de-duplication of split points, segment construction, a fallback segment
 * when nothing was produced, and, only when `maxPages` or `maxContentLength`
 * was supplied, breakpoint processing.
 *
 * @param pages - Array of pages with id and content
 * @param options - Segmentation options including splitting rules
 * @returns Array of segments with content, from/to page references, and optional metadata
 * @throws Error when `maxContentLength` is truthy and below 50
 *
 * @example
 * // Split markdown by headers
 * const segments = segmentPages(pages, {
 *   rules: [
 *     { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }
 *   ]
 * });
 *
 * @example
 * // Split Arabic hadith text with number extraction
 * const segments = segmentPages(pages, {
 *   rules: [
 *     {
 *       lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
 *       split: 'at',
 *       fuzzy: true,
 *       meta: { type: 'hadith' }
 *     }
 *   ]
 * });
 *
 * @example
 * // Multiple rules with page constraints
 * const segments = segmentPages(pages, {
 *   rules: [
 *     { lineStartsWith: ['{{kitab}}'], split: 'at', meta: { type: 'book' } },
 *     { lineStartsWith: ['{{bab}}'], split: 'at', min: 10, meta: { type: 'chapter' } },
 *     { regex: '^[٠-٩]+ - ', split: 'at', meta: { type: 'hadith' } }
 *   ]
 * });
 */
const segmentPages = (pages, options) => {
	const { rules = [], breakpoints = [], prefer = "longer", pageJoiner = "space", logger, maxContentLength, preprocess } = options;
	if (maxContentLength && maxContentLength < 50) throw new Error("maxContentLength must be at least 50 characters.");
	const maxPages = options.maxPages ?? Number.MAX_SAFE_INTEGER;
	const hasLimits = options.maxPages !== void 0 || maxContentLength !== void 0;
	const debug = resolveDebugConfig(options.debug);
	const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
	logger?.info?.("[segmenter] starting segmentation", {
		breakpointCount: breakpoints.length,
		maxContentLength,
		maxPages,
		pageCount: pages.length,
		prefer,
		preprocessCount: preprocess?.length ?? 0,
		ruleCount: rules.length
	});
	let preprocessedPages = pages;
	if (preprocess && preprocess.length > 0) {
		preprocessedPages = pages.map((page) => ({
			...page,
			content: applyPreprocessToPage(page.content, page.id, preprocess)
		}));
	}
	const built = buildPageMap(preprocessedPages);
	const matchContent = built.content;
	const normalizedContent = built.normalizedPages;
	const pageMap = built.pageMap;
	logger?.debug?.("[segmenter] content built", {
		pageIds: pageMap.pageIds,
		totalContentLength: matchContent.length
	});
	const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
	const unique = dedupeSplitPoints(splitPoints);
	logger?.debug?.("[segmenter] split points collected", {
		rawSplitPoints: splitPoints.length,
		uniqueSplitPoints: unique.length
	});
	const structural = buildSegments(unique, matchContent, pageMap, rules, pageJoiner);
	logger?.debug?.("[segmenter] structural segments built", { segmentCount: structural.length });
	const segments = ensureFallbackSegment(structural, preprocessedPages, normalizedContent, pageJoiner);
	if (!hasLimits) {
		logger?.info?.("[segmenter] segmentation complete (structural only)", { finalSegmentCount: segments.length });
		return segments;
	}
	logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
	const breakpointMetaKey = debug?.includeBreakpoint ? debug.metaKey : void 0;
	const result = applyBreakpoints(segments, preprocessedPages, normalizedContent, maxPages, breakpoints, prefer, (p) => processPattern(p, false).pattern, logger, pageJoiner, breakpointMetaKey, maxContentLength, processBreakpointPattern);
	logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
	return result;
};
4139
/**
 * Creates segment objects from split points.
 *
 * Handles segment creation including:
 * - Content extraction (with captured content for `lineStartsAfter`)
 * - Page break conversion to spaces
 * - From/to page reference calculation
 * - Metadata merging (static + named captures)
 *
 * Content before the first split point (or all content, when there are no
 * split points) becomes a leading segment only when `anyRuleAllowsId` accepts
 * the page id at offset 0.
 *
 * @param splitPoints - Sorted, unique split points
 * @param content - Full concatenated content string
 * @param pageMap - Page mapping utilities
 * @param rules - Original rules (for constraint checking on first segment)
 * @param pageJoiner - How page boundaries are represented in segment text
 * @returns Array of segment objects
 */
const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
	const applyMeta = (meta, namedCaptures) => meta || namedCaptures ? {
		...meta,
		...namedCaptures
	} : void 0;
	/**
	 * Creates a single segment from a content range, or `null` when the
	 * range yields no text.
	 */
	const createSegment = (start, end, meta, capturedContent, namedCaptures, contentStartOffset) => {
		// Skip the marker when the split point says content starts later.
		const actualStart = start + (contentStartOffset ?? 0);
		const sliced = content.slice(actualStart, end);
		let text = capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
		if (!text) return null;
		// In the marker-stripping case `text` lost its leading whitespace, so
		// its first character sits at `adjustedStart`, not `actualStart`.
		const adjustedStart = actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
		// Page-break offsets must be measured from where `text` really begins;
		// using `actualStart` here would shift every break by the trimmed amount.
		if (!capturedContent) text = convertPageBreaks(text, adjustedStart, pageMap.pageBreaks, pageJoiner);
		const from = pageMap.getId(adjustedStart);
		const to = capturedContent ? pageMap.getId(end - 1) : pageMap.getId(adjustedStart + text.length - 1);
		const seg = {
			content: text,
			from
		};
		if (to !== from) seg.to = to;
		const mergedMeta = applyMeta(meta, namedCaptures);
		if (mergedMeta) seg.meta = mergedMeta;
		return seg;
	};
	/**
	 * Creates one segment per split point; each runs to the next split
	 * point (or to the end of the content for the last one).
	 */
	const createSegmentsFromSplitPoints = () => {
		const result = [];
		for (let i = 0; i < splitPoints.length; i++) {
			const sp = splitPoints[i];
			const end = splitPoints[i + 1]?.index ?? content.length;
			const s = createSegment(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
			if (s) result.push(s);
		}
		return result;
	};
	const segments = [];
	if (!splitPoints.length) {
		if (anyRuleAllowsId(rules, pageMap.getId(0))) {
			const s = createSegment(0, content.length);
			if (s) segments.push(s);
		}
		return segments;
	}
	if (splitPoints[0].index > 0) {
		if (anyRuleAllowsId(rules, pageMap.getId(0))) {
			const s = createSegment(0, splitPoints[0].index);
			if (s) segments.push(s);
		}
	}
	return [...segments, ...createSegmentsFromSplitPoints()];
};
4212
- //#endregion
4213
- //#region src/validation/validate-segments.ts
4214
/**
 * Produces a short single-line preview of text for error reporting.
 *
 * Runs of whitespace are collapsed to one space and the ends are trimmed.
 * Previews longer than 140 characters are cut to 140 and suffixed with "...".
 *
 * @param text - Text to preview
 * @returns The collapsed, possibly truncated preview
 */
const buildPreview = (text) => {
	const collapsed = text.replace(/\s+/g, " ").trim();
	return collapsed.length > 140 ? `${collapsed.slice(0, 140)}...` : collapsed;
};
4223
/**
 * Builds a lightweight snapshot of a segment for validation reports: a
 * preview of its content plus its `from` / `to` page references.
 *
 * @param segment - Segment to snapshot
 * @returns `{ contentPreview, from, to }`
 */
const buildSegmentSnapshot = (segment) => {
	const contentPreview = buildPreview(segment.content);
	const { from, to } = segment;
	return {
		contentPreview,
		from,
		to
	};
};
4231
/**
 * Returns a copy of the pages reduced to `{ content, id }`, with the
 * configured preprocess transforms applied (when any are given) and line
 * endings normalized.
 *
 * @param pages - Input pages
 * @param options - Options; only `preprocess` is read
 * @returns New array of `{ content, id }` objects
 */
const normalizePages = (pages, options) => {
	const transforms = options.preprocess ?? [];
	const prepare = (page) => transforms.length ? applyPreprocessToPage(page.content, page.id, transforms) : page.content;
	return pages.map((page) => ({
		content: normalizeLineEndings(prepare(page)),
		id: page.id
	}));
};
4243
/**
 * Joins the content of all pages with `joiner` and records where each page
 * lands in the joined string.
 *
 * @param pages - Pages with `id` and `content`
 * @param joiner - String placed between consecutive pages
 * @returns `boundaries` (one `{ start, end, id }` per page, `end` being the
 *          offset just past the page's text) and `joined` (the full string)
 */
const buildJoinedContent = (pages, joiner) => {
	const joined = pages.map((page) => page.content).join(joiner);
	const boundaries = [];
	const lastIndex = pages.length - 1;
	let cursor = 0;
	for (let i = 0; i <= lastIndex; i++) {
		const { content, id } = pages[i];
		const end = cursor + content.length;
		boundaries.push({
			end,
			id,
			start: cursor
		});
		// Only pages followed by another page are followed by a joiner.
		cursor = i < lastIndex ? end + joiner.length : end;
	}
	return {
		boundaries,
		joined
	};
};
4267
/**
 * Binary-searches `boundaries` for the page whose `[start, end]` range
 * (both ends inclusive) contains `offset` and returns that page's id.
 *
 * When no range contains the offset:
 * - an offset beyond the last boundary's `end` yields the last page's id;
 * - any other offset (before the first boundary, or strictly between two
 *   boundaries) yields `undefined`, as does an empty `boundaries` array.
 *
 * @param offset - Character offset in the joined content
 * @param boundaries - Boundaries sorted by position
 * @returns The matching page id, or `undefined`
 */
const findBoundaryIdForOffset = (offset, boundaries) => {
	let low = 0;
	let high = boundaries.length - 1;
	while (low <= high) {
		const middle = low + high >>> 1;
		const candidate = boundaries[middle];
		if (offset < candidate.start) {
			high = middle - 1;
			continue;
		}
		if (offset > candidate.end) {
			low = middle + 1;
			continue;
		}
		return candidate.id;
	}
	if (boundaries.length === 0) return void 0;
	const last = boundaries.at(-1);
	if (offset > last.end) return last.id;
	return void 0;
};
4285
/**
 * Helper to construct a standardized validation issue object.
 *
 * Every issue starts from `actual` (the segment's own from/to), a snapshot of the segment and
 * its index; every field of `overrides` except `matchIndex` is merged on top of those.
 * `severity: "error"` and `type` are always set last.
 *
 * @param type Issue type. "page_not_found", "content_not_found", "page_attribution_mismatch"
 * and "max_pages_violation" add type-specific evidence/hint defaults; any other type gets the
 * base fields only
 * @param segment Segment the issue is about
 * @param segmentIndex Position of the segment in the validated list
 * @param overrides Optional fields such as `actual`, `expected`, `evidence`, `hint`;
 * `matchIndex` is only surfaced inside `pageContext` for "page_attribution_mismatch"
 * @param pageMap Optional Map from page id to page, used to attach a page preview
 * @returns The issue object
 */
const createIssue = (type, segment, segmentIndex, overrides = {}, pageMap) => {
    // matchIndex is pulled out so it never leaks into the top level of the issue.
    const { matchIndex, ...restOverrides } = overrides;
    const base = {
        actual: {
            from: segment.from,
            to: segment.to
        },
        segment: buildSegmentSnapshot(segment),
        segmentIndex,
        ...restOverrides
    };
    switch (type) {
        case "page_not_found": return {
            ...base,
            evidence: overrides.evidence ?? `Segment.from=${segment.from} does not exist in input pages.`,
            hint: "Check page IDs passed into segmentPages() and validateSegments().",
            severity: "error",
            type
        };
        case "content_not_found": {
            const claimedPage = pageMap?.get(segment.from);
            return {
                ...base,
                evidence: overrides.evidence ?? "Segment content not found in any page content.",
                hint: overrides.hint ?? "Check preprocessing options, joiner settings, or whitespace normalization.",
                pageContext: claimedPage ? {
                    pageId: claimedPage.id,
                    pagePreview: buildPreview(claimedPage.content)
                } : void 0,
                severity: "error",
                type
            };
        }
        case "page_attribution_mismatch": {
            const matchedFromId = overrides.expected?.from ?? overrides.actual?.from ?? segment.from;
            const actualPage = pageMap?.get(matchedFromId);
            return {
                ...base,
                evidence: overrides.evidence ?? `Content found in joined content at page ${matchedFromId}, but segment.from=${segment.from}.`,
                hint: overrides.hint ?? "Check duplicate content handling and boundary detection rules.",
                pageContext: actualPage ? {
                    matchIndex: matchIndex ?? -1,
                    pageId: actualPage.id,
                    pagePreview: buildPreview(actualPage.content)
                } : void 0,
                severity: "error",
                type
            };
        }
        case "max_pages_violation": {
            // Without an `actual` override the end page is the segment's own `to`; a segment
            // with no `to` covers only its `from` page. This keeps "undefined" out of the text.
            const lastPageId = overrides.actual?.to ?? segment.to ?? segment.from;
            return {
                ...base,
                evidence: overrides.evidence ?? `Segment spans pages ${segment.from}-${lastPageId}.`,
                hint: overrides.hint ?? "Check maxPages windowing in breakpoint-processor.ts and page constraints.",
                severity: "error",
                type
            };
        }
        default: return {
            ...base,
            severity: "error",
            type
        };
    }
};
4351
/**
 * Collects occurrences of `content` inside `joined`.
 *
 * Scanning begins at `searchStart`; an occurrence is kept only if it starts before `searchEnd`
 * (it may extend past it). Occurrences may overlap because the scan resumes one character after
 * the previous hit. At most `limit` occurrences are returned, which bounds the work on highly
 * repetitive text.
 *
 * @returns Array of `{ end, start }` where `end` is the index of the last matched character
 */
const findJoinedMatches = (content, joined, searchStart, searchEnd, limit = Infinity) => {
    const found = [];
    if (!content || searchStart >= searchEnd) return found;
    let at = joined.indexOf(content, searchStart);
    while (at !== -1 && at < searchEnd && found.length < limit) {
        found.push({ end: at + content.length - 1, start: at });
        at = joined.indexOf(content, at + 1);
    }
    return found;
};
4371
/**
 * Checks a located match against the page-span constraints.
 *
 * The page on which the match ends is resolved from `matchEnd`; when it cannot be resolved no
 * issue is reported. Otherwise, in order:
 * - with `maxPages === 0` the match must end on `segment.from`;
 * - when `segment.to` is set, the match must not end on a page after it;
 * - when `segment.to` is not set and `maxPages` is, the span from `segment.from` to the end
 *   page must not exceed `maxPages`.
 *
 * @returns An array holding a single "max_pages_violation" issue, or an empty array
 */
const checkMaxPagesViolation = (segment, segmentIndex, maxPages, matchEnd, _expectedBoundaryEnd, boundaries) => {
    const actualToId = findBoundaryIdForOffset(matchEnd, boundaries);
    if (actualToId === undefined) return [];
    // All three violations share the same shape and differ only in evidence and expected end page.
    const violation = (evidence, expectedTo) => [createIssue("max_pages_violation", segment, segmentIndex, {
        actual: {
            from: segment.from,
            to: actualToId
        },
        evidence,
        expected: {
            from: segment.from,
            to: expectedTo
        }
    })];
    if (maxPages === 0 && actualToId !== segment.from) {
        return violation(`Segment spans pages ${segment.from}-${actualToId} in joined content (maxPages=0).`, segment.from);
    }
    if (segment.to !== undefined) {
        return actualToId > segment.to
            ? violation(`Segment content ends on page ${actualToId} but segment.to is ${segment.to}.`, segment.to)
            : [];
    }
    if (maxPages !== undefined) {
        const span = actualToId - segment.from;
        if (span > maxPages) {
            return violation(`Segment spans ${span} pages (maxPages=${maxPages}).`, segment.from + maxPages);
        }
    }
    return [];
};
4419
/**
 * Handles a segment whose `from` page has no boundary entry.
 * Looks for the first occurrence of the segment content anywhere in the joined text: when there
 * is none a "content_not_found" issue is returned, otherwise a "page_attribution_mismatch" issue
 * naming the pages on which the occurrence starts and ends.
 */
const handleMissingBoundary = (segment, segmentIndex, joined, boundaries, pageMap) => {
    const [firstMatch] = findJoinedMatches(segment.content, joined, 0, joined.length, 1);
    if (firstMatch === undefined) {
        return [createIssue("content_not_found", segment, segmentIndex, { evidence: "Segment content not found in any page content." }, pageMap)];
    }
    const foundFromId = findBoundaryIdForOffset(firstMatch.start, boundaries);
    const foundToId = findBoundaryIdForOffset(firstMatch.end, boundaries);
    const mismatch = createIssue("page_attribution_mismatch", segment, segmentIndex, {
        actual: {
            from: segment.from,
            to: segment.to
        },
        evidence: `Content found in joined content at page ${foundFromId}, but segment.from=${segment.from}.`,
        expected: {
            from: foundFromId,
            to: foundToId
        },
        matchIndex: firstMatch.start
    }, pageMap);
    return [mismatch];
};
4442
/**
 * Widened search used when the direct lookup fails.
 *
 * 1. Searches the expected window padded by 1000 characters on each side (up to 5 matches).
 *    A match starting inside the expected boundary is checked against the page-span limits;
 *    otherwise the first match is reported as a "page_attribution_mismatch".
 * 2. If the padded window has no match and the content is shorter than
 *    `validationOptions.fullSearchThreshold` (default 500), the whole joined text is searched
 *    (up to 50 matches). A match starting on `segment.from` is checked against the page-span
 *    limits; otherwise the first match is reported as a "page_attribution_mismatch".
 * 3. If nothing is found, a "content_not_found" issue is returned.
 */
const handleFallbackSearch = (segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions) => {
    const { content } = segment;
    const windowPadding = 1e3;
    const pageAt = (offset) => findBoundaryIdForOffset(offset, boundaries);
    // The content exists, but not where the segment claims it does.
    const misattributed = (match) => {
        const foundFromId = pageAt(match.start);
        const foundToId = pageAt(match.end);
        return [createIssue("page_attribution_mismatch", segment, segmentIndex, {
            actual: {
                from: segment.from,
                to: segment.to
            },
            evidence: `Content found in joined content at page ${foundFromId}, but segment.from=${segment.from}.`,
            expected: {
                from: foundFromId,
                to: foundToId
            },
            matchIndex: match.start
        }, pageMap)];
    };
    // The content is where it is claimed to be; only the page-span limits remain to be checked.
    const withinLimits = (match) => checkMaxPagesViolation(segment, segmentIndex, maxPages, match.end, expectedBoundary.end, boundaries);

    const nearby = findJoinedMatches(
        content,
        joined,
        Math.max(0, searchStart - windowPadding),
        Math.min(joined.length, searchEnd + windowPadding),
        5
    );
    if (nearby.length > 0) {
        const aligned = nearby.find((match) => match.start >= expectedBoundary.start && match.start <= expectedBoundary.end);
        return aligned ? withinLimits(aligned) : misattributed(nearby[0]);
    }

    const threshold = validationOptions?.fullSearchThreshold ?? 500;
    if (content.length < threshold) {
        const everywhere = findJoinedMatches(content, joined, 0, joined.length, 50);
        const onClaimedPage = everywhere.find((match) => pageAt(match.start) === segment.from);
        if (onClaimedPage) return withinLimits(onClaimedPage);
        if (everywhere.length > 0) return misattributed(everywhere[0]);
    }
    return [createIssue("content_not_found", segment, segmentIndex, {
        evidence: `Segment content (${content.length} chars) not found in expected window.`,
        hint: "Check page boundary attribution in segmenter.ts."
    }, pageMap)];
};
4502
/**
 * Computes the exclusive upper bound for where a segment's content may start.
 *
 * - No `segment.to`: one past the end of the expected (`from`) boundary.
 * - `segment.to` with a known boundary: one past the end of that boundary.
 * - `segment.to` without a known boundary: 50000 characters past the end of the expected
 *   boundary, capped at `joinedLength`.
 */
const getSearchRange = (segment, expectedBoundary, boundaryMap, joinedLength) => {
    if (segment.to === undefined) return expectedBoundary.end + 1;
    const endBoundary = boundaryMap.get(segment.to);
    return endBoundary
        ? endBoundary.end + 1
        : Math.min(joinedLength, expectedBoundary.end + 5e4);
};
4514
/**
 * Validates where a single segment's content sits in the joined text.
 *
 * Empty content is reported as "content_not_found". A `from` page without a boundary is
 * delegated to handleMissingBoundary. Otherwise the content is looked up from the start of the
 * `from` boundary: a hit starting before the search end is checked against the page-span
 * limits, anything else goes through the widened fallback search.
 *
 * @returns The issues found for this segment (possibly empty)
 */
const getAttributionIssues = (segment, segmentIndex, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions) => {
    const { content } = segment;
    if (!content) {
        return [createIssue("content_not_found", segment, segmentIndex, { evidence: "Segment content is empty." }, pageMap)];
    }
    const expectedBoundary = boundaryMap.get(segment.from);
    if (!expectedBoundary) {
        return handleMissingBoundary(segment, segmentIndex, joined, boundaries, pageMap);
    }
    const searchEnd = getSearchRange(segment, expectedBoundary, boundaryMap, joined.length);
    const searchStart = expectedBoundary.start;
    const firstHit = joined.indexOf(content, searchStart);
    const foundInWindow = firstHit !== -1 && firstHit < searchEnd;
    if (!foundInWindow) {
        return handleFallbackSearch(segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions);
    }
    const lastMatchedIndex = firstHit + content.length - 1;
    return checkMaxPagesViolation(segment, segmentIndex, maxPages, lastMatchedIndex, expectedBoundary.end, boundaries);
};
4528
/**
 * Metadata-only page-span check, run before any content searching.
 * Compares the segment's declared `from`/`to` with `maxPages`; nothing is checked when either
 * `maxPages` or `segment.to` is undefined.
 *
 * @returns A "max_pages_violation" issue, or null when the declared span is acceptable
 */
const checkStaticMaxPages = (segment, index, maxPages) => {
    const { from, to } = segment;
    if (maxPages === undefined || to === undefined) return null;
    if (maxPages === 0) {
        // Zero means the segment may not leave its starting page at all.
        return to === from ? null : createIssue("max_pages_violation", segment, index, {
            evidence: "maxPages=0 requires all segments to stay within one page.",
            expected: {
                from,
                to: from
            },
            hint: "Check boundary detection in breakpoint-utils.ts."
        });
    }
    const span = to - from;
    if (span > maxPages) {
        return createIssue("max_pages_violation", segment, index, {
            evidence: `Segment spans ${span} pages (maxPages=${maxPages}).`,
            expected: {
                from,
                to: from + maxPages
            },
            hint: "Check breakpoint windowing and page attribution in breakpoint-processor.ts."
        });
    }
    return null;
};
4555
/**
 * Validates a list of segments against the source pages.
 * Checks for:
 * - Page existence (invalid IDs)
 * - Content fidelity (content must exist in pages)
 * - Page attribution (from/to must match content location)
 * - Page constraints (maxPages violations)
 *
 * A segment whose `from` page is unknown gets a single "page_not_found" issue and no further
 * checks. An unknown `to` page is reported, and the remaining checks still run.
 *
 * @param pages Input pages used for segmentation
 * @param options Options used during segmentation (for preprocessing/joining consistency)
 * @param segments The output segments to validate
 * @param validationOptions Optional settings for validation behavior
 * @returns A validation report: `issues`, `ok` (true when there are no issues) and a `summary`
 * of counts
 */
const validateSegments = (pages, options, segments, validationOptions) => {
    const normalizedPages = normalizePages(pages, options);
    const joiner = options.pageJoiner === "newline" ? "\n" : " ";
    const { boundaries, joined } = buildJoinedContent(normalizedPages, joiner);
    const boundaryMap = new Map(boundaries.map((boundary) => [boundary.id, boundary]));
    const pageMap = new Map(normalizedPages.map((page) => [page.id, page]));
    const pageIds = new Set(normalizedPages.map((page) => page.id));
    const maxPages = options.maxPages;
    const issues = [];
    for (const [index, segment] of segments.entries()) {
        if (!pageIds.has(segment.from)) {
            issues.push(createIssue("page_not_found", segment, index));
            continue;
        }
        const hasUnknownTo = segment.to !== undefined && !pageIds.has(segment.to);
        if (hasUnknownTo) {
            issues.push(createIssue("page_not_found", segment, index, { evidence: `Segment.to=${segment.to} does not exist in input pages.` }));
        }
        const staticMaxPageIssue = checkStaticMaxPages(segment, index, maxPages);
        if (staticMaxPageIssue) issues.push(staticMaxPageIssue);
        issues.push(...getAttributionIssues(segment, index, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions));
    }
    // Tally severities in a single pass over the collected issues.
    let errors = 0;
    let warnings = 0;
    for (const issue of issues) {
        if (issue.severity === "error") errors++;
        if (issue.severity === "warn") warnings++;
    }
    return {
        issues,
        ok: issues.length === 0,
        summary: {
            errors,
            issues: issues.length,
            pageCount: pages.length,
            segmentCount: segments.length,
            warnings
        }
    };
};
4605
- //#endregion
4606
- export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
163
+ export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, DictionaryProfileValidationError, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeDictionaryMarkdownPages, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, classifyDictionaryHeading, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, diagnoseDictionaryProfile, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, scanDictionaryMarkdownPage, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, suggestSegmentationOptions, templateToRegex, validateDictionaryProfile, validateRules, validateSegments, withCapture };
4607
164
 
4608
165
  //# sourceMappingURL=index.mjs.map