flappa-doormal 2.19.0 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +63 -11
- package/README.md +383 -11
- package/dist/index.d.mts +440 -132
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +2 -4445
- package/dist/index.mjs.map +1 -1
- package/dist/mcp/server.d.mts +1 -0
- package/dist/mcp/server.mjs +156 -0
- package/dist/mcp/server.mjs.map +1 -0
- package/dist/segmentation-advisor-D375TL8-.mjs +6128 -0
- package/dist/segmentation-advisor-D375TL8-.mjs.map +1 -0
- package/package.json +18 -4
package/dist/index.mjs
CHANGED
|
@@ -1,1123 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
/**
|
|
3
|
-
* Arabic base letters used by low-level dictionary-style regex helpers.
|
|
4
|
-
*
|
|
5
|
-
* This is intentionally broader than `{{harf}}`:
|
|
6
|
-
* - includes standalone hamza `ء`
|
|
7
|
-
* - stays as a raw regex fragment rather than a template token
|
|
8
|
-
*/
|
|
9
|
-
const ARABIC_BASE_LETTER_CLASS = "[ء-غف-ي]";
|
|
10
|
-
/**
|
|
11
|
-
* Arabic combining marks / annotation signs used by low-level regex helpers.
|
|
12
|
-
*/
|
|
13
|
-
const ARABIC_MARKS_CLASS = "[\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]";
|
|
14
|
-
/**
|
|
15
|
-
* A single Arabic base letter followed by zero or more combining marks.
|
|
16
|
-
*/
|
|
17
|
-
const ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN = `${ARABIC_BASE_LETTER_CLASS}${ARABIC_MARKS_CLASS}*`;
|
|
18
|
-
/**
|
|
19
|
-
* One or more Arabic letters, where each letter may carry combining marks.
|
|
20
|
-
*/
|
|
21
|
-
const ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN = `(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN})+`;
|
|
22
|
-
const ARABIC_SPACED_CODE_ATOM = `[أ-غف-ي]${ARABIC_MARKS_CLASS}*`;
|
|
23
|
-
const RUMUZ_ATOM = `(?:${[
|
|
24
|
-
"تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
|
|
25
|
-
"خت",
|
|
26
|
-
"خغ",
|
|
27
|
-
"بخ",
|
|
28
|
-
"عخ",
|
|
29
|
-
"مق",
|
|
30
|
-
"مت",
|
|
31
|
-
"عس",
|
|
32
|
-
"سي",
|
|
33
|
-
"سن",
|
|
34
|
-
"كن",
|
|
35
|
-
"مد",
|
|
36
|
-
"قد",
|
|
37
|
-
"خد",
|
|
38
|
-
"فد",
|
|
39
|
-
"دل",
|
|
40
|
-
"كد",
|
|
41
|
-
"غد",
|
|
42
|
-
"صد",
|
|
43
|
-
"دت",
|
|
44
|
-
"دس",
|
|
45
|
-
"تم",
|
|
46
|
-
"فق",
|
|
47
|
-
"دق",
|
|
48
|
-
"[خرزيمنصسدفلتقع](?![\\u064B-\\u0652\\u0670أ-ي])",
|
|
49
|
-
"(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
|
|
50
|
-
].join("|")})`;
|
|
51
|
-
const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
|
|
52
|
-
const BASE_TOKENS = {
|
|
53
|
-
/** Chapter marker (باب). */
|
|
54
|
-
bab: "باب",
|
|
55
|
-
/** Basmala (بسم الله). Also matches ﷽. */
|
|
56
|
-
basmalah: ["بسم الله", "﷽"].join("|"),
|
|
57
|
-
/** Bullet point variants: `•`, `*`, `°`. */
|
|
58
|
-
bullet: "[•*°]",
|
|
59
|
-
/** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
|
|
60
|
-
dash: "[-–—ـ]",
|
|
61
|
-
/** Section marker (فصل / مسألة). */
|
|
62
|
-
fasl: ["مسألة", "فصل"].join("|"),
|
|
63
|
-
/** Single Arabic letter (أ-ي). Does NOT include diacritics. */
|
|
64
|
-
harf: "[أ-ي]",
|
|
65
|
-
/** One or more single Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter (e.g. `د ت س`, `هـ ث`). For multi-letter codes use `{{rumuz}}`. */
|
|
66
|
-
harfs: `${ARABIC_SPACED_CODE_ATOM}(?:\\s+${ARABIC_SPACED_CODE_ATOM})*`,
|
|
67
|
-
/** Horizontal rule / separator: 5+ repeated dashes, underscores, equals, or tatweels. Mixed allowed. */
|
|
68
|
-
hr: "[-–—ـ_=]{5,}",
|
|
69
|
-
/** Book marker (كتاب). */
|
|
70
|
-
kitab: "كتاب",
|
|
71
|
-
/** Hadith transmission phrases (حدثنا, أخبرنا, حدثني, etc.). */
|
|
72
|
-
naql: [
|
|
73
|
-
"حدثني",
|
|
74
|
-
"وأخبرنا",
|
|
75
|
-
"حدثنا",
|
|
76
|
-
"سمعت",
|
|
77
|
-
"أنبأنا",
|
|
78
|
-
"وحدثنا",
|
|
79
|
-
"أخبرنا",
|
|
80
|
-
"وحدثني",
|
|
81
|
-
"وحدثنيه"
|
|
82
|
-
].join("|"),
|
|
83
|
-
/** Newline character. Useful for breakpoints that split on line boundaries. */
|
|
84
|
-
newline: "\\n",
|
|
85
|
-
/** Single ASCII digit (0-9). */
|
|
86
|
-
num: "\\d",
|
|
87
|
-
/** One or more ASCII digits (0-9)+. */
|
|
88
|
-
nums: "\\d+",
|
|
89
|
-
/** Single Arabic-Indic digit (٠-٩, U+0660-U+0669). */
|
|
90
|
-
raqm: "[\\u0660-\\u0669]",
|
|
91
|
-
/** One or more Arabic-Indic digits (٠-٩)+. */
|
|
92
|
-
raqms: "[\\u0660-\\u0669]+",
|
|
93
|
-
/** Rijāl/takhrīj source abbreviations. Matches one or more codes separated by whitespace. */
|
|
94
|
-
rumuz: RUMUZ_BLOCK,
|
|
95
|
-
/** Arabic/common punctuation: `.`, `!`, `?`, `؟`, `؛`. */
|
|
96
|
-
tarqim: "[.!?؟؛]"
|
|
97
|
-
};
|
|
98
|
-
/** Pre-defined token constants for use in patterns. */
|
|
99
|
-
const Token = {
|
|
100
|
-
/** Chapter marker - باب */
|
|
101
|
-
BAB: "{{bab}}",
|
|
102
|
-
/** Basmala - بسم الله */
|
|
103
|
-
BASMALAH: "{{basmalah}}",
|
|
104
|
-
/** Bullet point variants */
|
|
105
|
-
BULLET: "{{bullet}}",
|
|
106
|
-
/** Dash variants (hyphen, en-dash, em-dash, tatweel) */
|
|
107
|
-
DASH: "{{dash}}",
|
|
108
|
-
/** Section marker - فصل / مسألة */
|
|
109
|
-
FASL: "{{fasl}}",
|
|
110
|
-
/** Single Arabic letter */
|
|
111
|
-
HARF: "{{harf}}",
|
|
112
|
-
/** Multiple Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter */
|
|
113
|
-
HARFS: "{{harfs}}",
|
|
114
|
-
/** Horizontal rule / separator (repeated dashes) */
|
|
115
|
-
HR: "{{hr}}",
|
|
116
|
-
/** Book marker - كتاب */
|
|
117
|
-
KITAB: "{{kitab}}",
|
|
118
|
-
/** Hadith transmission phrases */
|
|
119
|
-
NAQL: "{{naql}}",
|
|
120
|
-
/** Newline character (for breakpoints) */
|
|
121
|
-
NEWLINE: "{{newline}}",
|
|
122
|
-
/** Single ASCII digit */
|
|
123
|
-
NUM: "{{num}}",
|
|
124
|
-
/** Composite: {{raqms}} {{dash}} (space) */
|
|
125
|
-
NUMBERED: "{{numbered}}",
|
|
126
|
-
/** One or more ASCII digits */
|
|
127
|
-
NUMS: "{{nums}}",
|
|
128
|
-
/** Single Arabic-Indic digit */
|
|
129
|
-
RAQM: "{{raqm}}",
|
|
130
|
-
/** One or more Arabic-Indic digits */
|
|
131
|
-
RAQMS: "{{raqms}}",
|
|
132
|
-
/** Source abbreviations (rijāl/takhrīj) */
|
|
133
|
-
RUMUZ: "{{rumuz}}",
|
|
134
|
-
/** Punctuation marks */
|
|
135
|
-
TARQIM: "{{tarqim}}"
|
|
136
|
-
};
|
|
137
|
-
/** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
|
|
138
|
-
const withCapture = (token, name) => {
|
|
139
|
-
const match = token.match(/^\{\{(\w+)\}\}$/);
|
|
140
|
-
if (!match) return `{{:${name}}}`;
|
|
141
|
-
return `{{${match[1]}:${name}}}`;
|
|
142
|
-
};
|
|
143
|
-
/** Composite tokens that reference base tokens. Pre-expanded at load time. @internal */
|
|
144
|
-
const COMPOSITE_TOKENS = {
|
|
145
|
-
/** Common hadith numbering format: Arabic-Indic digits + dash + space. */
|
|
146
|
-
numbered: "{{raqms}} {{dash}} " };
|
|
147
|
-
/** Expands composite tokens (e.g. `{{numbered}}`) to their underlying template form. */
|
|
148
|
-
const expandCompositeTokensInTemplate = (template) => {
|
|
149
|
-
let out = template;
|
|
150
|
-
for (let i = 0; i < 10; i++) {
|
|
151
|
-
const next = out.replace(/\{\{(\w+)\}\}/g, (m, tokenName) => COMPOSITE_TOKENS[tokenName] ?? m);
|
|
152
|
-
if (next === out) break;
|
|
153
|
-
out = next;
|
|
154
|
-
}
|
|
155
|
-
return out;
|
|
156
|
-
};
|
|
157
|
-
/**
|
|
158
|
-
* Expands base tokens in a template string.
|
|
159
|
-
* Used internally to pre-expand composite tokens.
|
|
160
|
-
*
|
|
161
|
-
* @param template - Template string with `{{token}}` placeholders
|
|
162
|
-
* @returns Expanded pattern with base tokens replaced
|
|
163
|
-
* @internal
|
|
164
|
-
*/
|
|
165
|
-
const expandBaseTokens = (template) => template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => BASE_TOKENS[tokenName] ?? `{{${tokenName}}}`);
|
|
166
|
-
/**
|
|
167
|
-
* Token definitions mapping human-readable token names to regex patterns.
|
|
168
|
-
*
|
|
169
|
-
* Tokens are used in template strings with double-brace syntax:
|
|
170
|
-
* - `{{token}}` - Expands to the pattern (non-capturing in context)
|
|
171
|
-
* - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
|
|
172
|
-
* - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
|
|
173
|
-
*
|
|
174
|
-
* @remarks
|
|
175
|
-
* These patterns are designed for Arabic text matching. For diacritic-insensitive
|
|
176
|
-
* matching of Arabic patterns, use the `fuzzy: true` option in split rules,
|
|
177
|
-
* which applies `makeDiacriticInsensitive()` to the expanded patterns.
|
|
178
|
-
*
|
|
179
|
-
* @example
|
|
180
|
-
* // Using tokens in a split rule
|
|
181
|
-
* { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
|
|
182
|
-
*
|
|
183
|
-
* @example
|
|
184
|
-
* // Using tokens with named captures
|
|
185
|
-
* { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
|
|
186
|
-
*
|
|
187
|
-
* @example
|
|
188
|
-
* // Using the numbered convenience token
|
|
189
|
-
* { lineStartsAfter: ['{{numbered}}'], split: 'at' }
|
|
190
|
-
*/
|
|
191
|
-
const TOKEN_PATTERNS = {
|
|
192
|
-
...BASE_TOKENS,
|
|
193
|
-
...Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([k, v]) => [k, expandBaseTokens(v)]))
|
|
194
|
-
};
|
|
195
|
-
/**
|
|
196
|
-
* Regex pattern for matching tokens with optional named capture syntax.
|
|
197
|
-
*
|
|
198
|
-
* Matches:
|
|
199
|
-
* - `{{token}}` - Simple token (group 1 = token name, group 2 = empty)
|
|
200
|
-
* - `{{token:name}}` - Token with capture (group 1 = token, group 2 = name)
|
|
201
|
-
* - `{{:name}}` - Capture-only (group 1 = empty, group 2 = name)
|
|
202
|
-
*
|
|
203
|
-
* @internal
|
|
204
|
-
*/
|
|
205
|
-
const TOKEN_WITH_CAPTURE_REGEX = /\{\{(\w*):?(\w*)\}\}/g;
|
|
206
|
-
/**
|
|
207
|
-
* Regex pattern for simple token matching (no capture syntax).
|
|
208
|
-
*
|
|
209
|
-
* Matches only `{{token}}` format where token is one or more word characters.
|
|
210
|
-
* Used by `containsTokens()` for quick detection.
|
|
211
|
-
*
|
|
212
|
-
* @internal
|
|
213
|
-
*/
|
|
214
|
-
const SIMPLE_TOKEN_REGEX = /\{\{(\w+)\}\}/g;
|
|
215
|
-
/**
|
|
216
|
-
* Checks if a query string contains template tokens.
|
|
217
|
-
*
|
|
218
|
-
* Performs a quick test for `{{token}}` patterns without actually
|
|
219
|
-
* expanding them. Useful for determining whether to apply token
|
|
220
|
-
* expansion to a string.
|
|
221
|
-
*
|
|
222
|
-
* @param query - String to check for tokens
|
|
223
|
-
* @returns `true` if the string contains at least one `{{token}}` pattern
|
|
224
|
-
*
|
|
225
|
-
* @example
|
|
226
|
-
* containsTokens('{{raqms}} {{dash}}') // → true
|
|
227
|
-
* containsTokens('plain text') // → false
|
|
228
|
-
* containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
|
|
229
|
-
*/
|
|
230
|
-
const containsTokens = (query) => {
|
|
231
|
-
SIMPLE_TOKEN_REGEX.lastIndex = 0;
|
|
232
|
-
return SIMPLE_TOKEN_REGEX.test(query);
|
|
233
|
-
};
|
|
234
|
-
const splitTemplateIntoSegments = (query) => {
|
|
235
|
-
const segments = [];
|
|
236
|
-
let lastIndex = 0;
|
|
237
|
-
TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
|
|
238
|
-
for (const match of query.matchAll(TOKEN_WITH_CAPTURE_REGEX)) {
|
|
239
|
-
if (match.index > lastIndex) segments.push({
|
|
240
|
-
type: "text",
|
|
241
|
-
value: query.slice(lastIndex, match.index)
|
|
242
|
-
});
|
|
243
|
-
segments.push({
|
|
244
|
-
type: "token",
|
|
245
|
-
value: match[0]
|
|
246
|
-
});
|
|
247
|
-
lastIndex = match.index + match[0].length;
|
|
248
|
-
}
|
|
249
|
-
if (lastIndex < query.length) segments.push({
|
|
250
|
-
type: "text",
|
|
251
|
-
value: query.slice(lastIndex)
|
|
252
|
-
});
|
|
253
|
-
return segments;
|
|
254
|
-
};
|
|
255
|
-
const maybeApplyFuzzyToText = (text, fuzzyTransform) => fuzzyTransform && /[\u0600-\u06FF]/u.test(text) ? fuzzyTransform(text) : text;
|
|
256
|
-
const maybeApplyFuzzyToTokenPattern = (tokenPattern, fuzzyTransform) => !fuzzyTransform ? tokenPattern : tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/u.test(part) ? fuzzyTransform(part) : part).join("|");
|
|
257
|
-
const parseTokenLiteral = (literal) => {
|
|
258
|
-
TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
|
|
259
|
-
const m = TOKEN_WITH_CAPTURE_REGEX.exec(literal);
|
|
260
|
-
return m ? {
|
|
261
|
-
captureName: m[2],
|
|
262
|
-
tokenName: m[1]
|
|
263
|
-
} : null;
|
|
264
|
-
};
|
|
265
|
-
const createCaptureRegistry = (capturePrefix) => {
|
|
266
|
-
const captureNames = [];
|
|
267
|
-
const captureNameCounts = /* @__PURE__ */ new Map();
|
|
268
|
-
const register = (baseName) => {
|
|
269
|
-
const count = captureNameCounts.get(baseName) ?? 0;
|
|
270
|
-
captureNameCounts.set(baseName, count + 1);
|
|
271
|
-
const uniqueName = count === 0 ? baseName : `${baseName}_${count + 1}`;
|
|
272
|
-
const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
|
|
273
|
-
captureNames.push(prefixedName);
|
|
274
|
-
return prefixedName;
|
|
275
|
-
};
|
|
276
|
-
return {
|
|
277
|
-
captureNames,
|
|
278
|
-
register
|
|
279
|
-
};
|
|
280
|
-
};
|
|
281
|
-
const expandTokenLiteral = (literal, opts) => {
|
|
282
|
-
const parsed = parseTokenLiteral(literal);
|
|
283
|
-
if (!parsed) return literal;
|
|
284
|
-
const { tokenName, captureName } = parsed;
|
|
285
|
-
if (!tokenName && captureName) return `(?<${opts.registerCapture(captureName)}>.+)`;
|
|
286
|
-
let tokenPattern = TOKEN_PATTERNS[tokenName];
|
|
287
|
-
if (!tokenPattern) return literal;
|
|
288
|
-
tokenPattern = maybeApplyFuzzyToTokenPattern(tokenPattern, opts.fuzzyTransform);
|
|
289
|
-
if (captureName) return `(?<${opts.registerCapture(captureName)}>${tokenPattern})`;
|
|
290
|
-
return tokenPattern;
|
|
291
|
-
};
|
|
292
|
-
/**
|
|
293
|
-
* Expands template tokens with support for named captures.
|
|
294
|
-
*
|
|
295
|
-
* This is the primary token expansion function that handles all token syntax:
|
|
296
|
-
* - `{{token}}` → Expands to the token's pattern (no capture group)
|
|
297
|
-
* - `{{token:name}}` → Expands to `(?<name>pattern)` (named capture)
|
|
298
|
-
* - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
|
|
299
|
-
*
|
|
300
|
-
* Unknown tokens are left as-is in the output, allowing for partial templates.
|
|
301
|
-
*
|
|
302
|
-
* @param query - The template string containing tokens
|
|
303
|
-
* @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
|
|
304
|
-
* Applied to both token patterns and plain Arabic text between tokens.
|
|
305
|
-
* Typically `makeDiacriticInsensitive` from the fuzzy module.
|
|
306
|
-
* @returns Object with expanded pattern, capture names, and capture flag
|
|
307
|
-
*
|
|
308
|
-
* @example
|
|
309
|
-
* // Simple token expansion
|
|
310
|
-
* expandTokensWithCaptures('{{raqms}} {{dash}}')
|
|
311
|
-
* // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
|
|
312
|
-
*
|
|
313
|
-
* @example
|
|
314
|
-
* // Named capture
|
|
315
|
-
* expandTokensWithCaptures('{{raqms:num}} {{dash}}')
|
|
316
|
-
* // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
|
|
317
|
-
*
|
|
318
|
-
* @example
|
|
319
|
-
* // Capture-only token
|
|
320
|
-
* expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
|
|
321
|
-
* // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
|
|
322
|
-
*
|
|
323
|
-
* @example
|
|
324
|
-
* // With fuzzy transform
|
|
325
|
-
* expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
|
|
326
|
-
* // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
|
|
327
|
-
*/
|
|
328
|
-
const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
|
|
329
|
-
const segments = splitTemplateIntoSegments(query);
|
|
330
|
-
const registry = createCaptureRegistry(capturePrefix);
|
|
331
|
-
const pattern = segments.map((segment) => segment.type === "text" ? maybeApplyFuzzyToText(segment.value, fuzzyTransform) : expandTokenLiteral(segment.value, {
|
|
332
|
-
capturePrefix,
|
|
333
|
-
fuzzyTransform,
|
|
334
|
-
registerCapture: registry.register
|
|
335
|
-
})).join("");
|
|
336
|
-
return {
|
|
337
|
-
captureNames: registry.captureNames,
|
|
338
|
-
hasCaptures: registry.captureNames.length > 0,
|
|
339
|
-
pattern
|
|
340
|
-
};
|
|
341
|
-
};
|
|
342
|
-
/**
|
|
343
|
-
* Expands template tokens in a query string to their regex equivalents.
|
|
344
|
-
*
|
|
345
|
-
* This is the simple version without capture support. It returns only the
|
|
346
|
-
* expanded pattern string, not capture metadata.
|
|
347
|
-
*
|
|
348
|
-
* Unknown tokens are left as-is, allowing for partial templates.
|
|
349
|
-
*
|
|
350
|
-
* @param query - Template string containing `{{token}}` placeholders
|
|
351
|
-
* @returns Expanded regex pattern string
|
|
352
|
-
*
|
|
353
|
-
* @example
|
|
354
|
-
* expandTokens('، {{raqms}}') // → '، [\\u0660-\\u0669]+'
|
|
355
|
-
* expandTokens('{{raqm}}*') // → '[\\u0660-\\u0669]*'
|
|
356
|
-
* expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
|
|
357
|
-
* expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
|
|
358
|
-
*
|
|
359
|
-
* @see expandTokensWithCaptures for full capture group support
|
|
360
|
-
*/
|
|
361
|
-
const expandTokens = (query) => expandTokensWithCaptures(query).pattern;
|
|
362
|
-
/**
|
|
363
|
-
* Converts a template string to a compiled RegExp.
|
|
364
|
-
*
|
|
365
|
-
* Expands all tokens and attempts to compile the result as a RegExp
|
|
366
|
-
* with Unicode flag. Returns `null` if the resulting pattern is invalid.
|
|
367
|
-
*
|
|
368
|
-
* @remarks
|
|
369
|
-
* This function dynamically compiles regular expressions from template strings.
|
|
370
|
-
* If templates may come from untrusted sources, be aware of potential ReDoS
|
|
371
|
-
* (Regular Expression Denial of Service) risks due to catastrophic backtracking.
|
|
372
|
-
* Consider validating pattern complexity or applying execution timeouts when
|
|
373
|
-
* running user-submitted patterns.
|
|
374
|
-
*
|
|
375
|
-
* @param template - Template string containing `{{token}}` placeholders
|
|
376
|
-
* @returns Compiled RegExp with 'u' flag, or `null` if invalid
|
|
377
|
-
*
|
|
378
|
-
* @example
|
|
379
|
-
* templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
|
|
380
|
-
* templateToRegex('{{raqms}}+') // → /[٠-٩]++/u (might be invalid in some engines)
|
|
381
|
-
* templateToRegex('(((') // → null (invalid regex)
|
|
382
|
-
*/
|
|
383
|
-
const templateToRegex = (template) => {
|
|
384
|
-
const expanded = expandTokens(template);
|
|
385
|
-
try {
|
|
386
|
-
return new RegExp(expanded, "u");
|
|
387
|
-
} catch {
|
|
388
|
-
return null;
|
|
389
|
-
}
|
|
390
|
-
};
|
|
391
|
-
/**
|
|
392
|
-
* Lists all available token names defined in `TOKEN_PATTERNS`.
|
|
393
|
-
*
|
|
394
|
-
* Useful for documentation, validation, or building user interfaces
|
|
395
|
-
* that show available tokens.
|
|
396
|
-
*
|
|
397
|
-
* @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
|
|
398
|
-
*
|
|
399
|
-
* @example
|
|
400
|
-
* getAvailableTokens()
|
|
401
|
-
* // → ['bab', 'basmalah', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
|
|
402
|
-
*/
|
|
403
|
-
const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
|
|
404
|
-
/**
|
|
405
|
-
* Gets the regex pattern for a specific token name.
|
|
406
|
-
*
|
|
407
|
-
* Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
|
|
408
|
-
* without any expansion or capture group wrapping.
|
|
409
|
-
*
|
|
410
|
-
* @param tokenName - The token name to look up (e.g., `'raqms'`, `'dash'`, `'harfs'`)
|
|
411
|
-
* @returns The regex pattern string for that known token
|
|
412
|
-
*
|
|
413
|
-
* @example
|
|
414
|
-
* getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
|
|
415
|
-
* getTokenPattern('dash') // → '[-–—ـ]'
|
|
416
|
-
* getTokenPattern('harfs') // → pattern for spaced isolated Arabic letter codes
|
|
417
|
-
*/
|
|
418
|
-
const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
|
|
419
|
-
/**
|
|
420
|
-
* Regex to detect fuzzy-default tokens in a pattern string.
|
|
421
|
-
* Matches {{token}} or {{token:name}} syntax.
|
|
422
|
-
*/
|
|
423
|
-
const FUZZY_TOKEN_REGEX = new RegExp(`\\{\\{(?:${[
|
|
424
|
-
"bab",
|
|
425
|
-
"basmalah",
|
|
426
|
-
"fasl",
|
|
427
|
-
"kitab",
|
|
428
|
-
"naql"
|
|
429
|
-
].join("|")})(?::\\w+)?\\}\\}`, "g");
|
|
430
|
-
/**
|
|
431
|
-
* Checks if a pattern (or array of patterns) contains tokens that should
|
|
432
|
-
* default to fuzzy matching.
|
|
433
|
-
*
|
|
434
|
-
* Fuzzy-default tokens are: bab, basmalah, fasl, kitab, naql
|
|
435
|
-
*
|
|
436
|
-
* @param patterns - Single pattern string or array of pattern strings
|
|
437
|
-
* @returns `true` if any pattern contains a fuzzy-default token
|
|
438
|
-
*
|
|
439
|
-
* @example
|
|
440
|
-
* shouldDefaultToFuzzy('{{bab}} الإيمان') // true
|
|
441
|
-
* shouldDefaultToFuzzy('{{raqms}} {{dash}}') // false
|
|
442
|
-
* shouldDefaultToFuzzy(['{{kitab}}', '{{raqms}}']) // true
|
|
443
|
-
*/
|
|
444
|
-
const shouldDefaultToFuzzy = (patterns) => {
|
|
445
|
-
return (Array.isArray(patterns) ? patterns : [patterns]).some((p) => {
|
|
446
|
-
FUZZY_TOKEN_REGEX.lastIndex = 0;
|
|
447
|
-
return FUZZY_TOKEN_REGEX.test(p);
|
|
448
|
-
});
|
|
449
|
-
};
|
|
450
|
-
/**
|
|
451
|
-
* Apply token mappings to a template string.
|
|
452
|
-
*
|
|
453
|
-
* Transforms `{{token}}` into `{{token:name}}` based on the provided mappings.
|
|
454
|
-
* Useful for applying user-configured capture names to a raw template.
|
|
455
|
-
*
|
|
456
|
-
* - Only affects exact matches of `{{token}}`.
|
|
457
|
-
* - Does NOT affect tokens that already have a capture name (e.g. `{{token:existing}}`).
|
|
458
|
-
* - Does NOT affect capture-only tokens (e.g. `{{:name}}`).
|
|
459
|
-
*
|
|
460
|
-
* @param template - The template string to transform
|
|
461
|
-
* @param mappings - Array of mappings from token name to capture name
|
|
462
|
-
* @returns Transformed template string with captures applied
|
|
463
|
-
*
|
|
464
|
-
* @example
|
|
465
|
-
* applyTokenMappings('{{raqms}} {{dash}}', [{ token: 'raqms', name: 'num' }])
|
|
466
|
-
* // → '{{raqms:num}} {{dash}}'
|
|
467
|
-
*/
|
|
468
|
-
const applyTokenMappings = (template, mappings) => {
|
|
469
|
-
let result = template;
|
|
470
|
-
for (const { token, name } of mappings) {
|
|
471
|
-
if (!token || !name) continue;
|
|
472
|
-
const regex = new RegExp(`\\{\\{${token}\\}\\}`, "g");
|
|
473
|
-
result = result.replace(regex, `{{${token}:${name}}}`);
|
|
474
|
-
}
|
|
475
|
-
return result;
|
|
476
|
-
};
|
|
477
|
-
/**
|
|
478
|
-
* Strip token mappings from a template string.
|
|
479
|
-
*
|
|
480
|
-
* Transforms `{{token:name}}` back into `{{token}}`.
|
|
481
|
-
* Also transforms `{{:name}}` patterns (capture-only) into `{{}}` (which is invalid/empty).
|
|
482
|
-
*
|
|
483
|
-
* Useful for normalizing templates for storage or comparison.
|
|
484
|
-
*
|
|
485
|
-
* @param template - The template string to strip
|
|
486
|
-
* @returns Template string with capture names removed
|
|
487
|
-
*
|
|
488
|
-
* @example
|
|
489
|
-
* stripTokenMappings('{{raqms:num}} {{dash}}')
|
|
490
|
-
* // → '{{raqms}} {{dash}}'
|
|
491
|
-
*/
|
|
492
|
-
const stripTokenMappings = (template) => {
|
|
493
|
-
return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
|
|
494
|
-
};
|
|
495
|
-
//#endregion
|
|
496
|
-
//#region src/utils/textUtils.ts
|
|
497
|
-
/**
|
|
498
|
-
* Normalizes line endings to Unix-style (`\n`).
|
|
499
|
-
*
|
|
500
|
-
* Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
|
|
501
|
-
* for consistent pattern matching across platforms.
|
|
502
|
-
*
|
|
503
|
-
* @param content - Raw content with potentially mixed line endings
|
|
504
|
-
* @returns Content with all line endings normalized to `\n`
|
|
505
|
-
*/
|
|
506
|
-
const normalizeLineEndings = (content) => {
|
|
507
|
-
return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
|
|
508
|
-
};
|
|
509
|
-
/**
|
|
510
|
-
* Escapes regex metacharacters (parentheses and brackets) in template patterns,
|
|
511
|
-
* but preserves content inside `{{...}}` token delimiters.
|
|
512
|
-
*
|
|
513
|
-
* This allows users to write intuitive patterns like `({{harf}}):` instead of
|
|
514
|
-
* the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
|
|
515
|
-
* so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
|
|
516
|
-
*
|
|
517
|
-
* @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
|
|
518
|
-
* @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
|
|
519
|
-
*
|
|
520
|
-
* @example
|
|
521
|
-
* escapeTemplateBrackets('({{harf}}): ')
|
|
522
|
-
* // → '\\({{harf}}\\): '
|
|
523
|
-
*
|
|
524
|
-
* @example
|
|
525
|
-
* escapeTemplateBrackets('[{{raqm}}] ')
|
|
526
|
-
* // → '\\[{{raqm}}\\] '
|
|
527
|
-
*
|
|
528
|
-
* @example
|
|
529
|
-
* escapeTemplateBrackets('{{harf}}')
|
|
530
|
-
* // → '{{harf}}' (unchanged - no brackets outside tokens)
|
|
531
|
-
*/
|
|
532
|
-
const escapeTemplateBrackets = (pattern) => {
|
|
533
|
-
return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
|
|
534
|
-
};
|
|
535
|
-
/**
|
|
536
|
-
* Character class matching all Arabic diacritics (Tashkeel/Harakat).
|
|
537
|
-
*
|
|
538
|
-
* Includes the following diacritical marks:
|
|
539
|
-
* - U+0640: ـ (tatweel / kashida)
|
|
540
|
-
* - U+064B: ً (fathatan - double fatha)
|
|
541
|
-
* - U+064C: ٌ (dammatan - double damma)
|
|
542
|
-
* - U+064D: ٍ (kasratan - double kasra)
|
|
543
|
-
* - U+064E: َ (fatha - short a)
|
|
544
|
-
* - U+064F: ُ (damma - short u)
|
|
545
|
-
* - U+0650: ِ (kasra - short i)
|
|
546
|
-
* - U+0651: ّ (shadda - gemination)
|
|
547
|
-
* - U+0652: ْ (sukun - no vowel)
|
|
548
|
-
*
|
|
549
|
-
* @internal
|
|
550
|
-
*/
|
|
551
|
-
const DIACRITICS_CLASS = "[ـًٌٍَُِّْ]";
|
|
552
|
-
/**
|
|
553
|
-
* Groups of equivalent Arabic characters.
|
|
554
|
-
*
|
|
555
|
-
* Characters within the same group are considered equivalent for matching purposes.
|
|
556
|
-
* This handles common variations in Arabic text where different characters are
|
|
557
|
-
* used interchangeably or have the same underlying meaning.
|
|
558
|
-
*
|
|
559
|
-
* Equivalence groups:
|
|
560
|
-
* - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
|
|
561
|
-
* - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
|
|
562
|
-
* - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
|
|
563
|
-
*
|
|
564
|
-
* @internal
|
|
565
|
-
*/
|
|
566
|
-
const EQUIV_GROUPS = [
|
|
567
|
-
[
|
|
568
|
-
"ا",
|
|
569
|
-
"آ",
|
|
570
|
-
"أ",
|
|
571
|
-
"إ"
|
|
572
|
-
],
|
|
573
|
-
["ة", "ه"],
|
|
574
|
-
["ى", "ي"]
|
|
575
|
-
];
|
|
576
|
-
const DIACRITICS_AND_MARKS_REGEX = new RegExp(ARABIC_MARKS_CLASS, "g");
|
|
577
|
-
/**
|
|
578
|
-
* Escapes a string for safe inclusion in a regular expression.
|
|
579
|
-
*
|
|
580
|
-
* Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
|
|
581
|
-
*
|
|
582
|
-
* @param s - Any string to escape
|
|
583
|
-
* @returns String with regex metacharacters escaped
|
|
584
|
-
*
|
|
585
|
-
* @example
|
|
586
|
-
* escapeRegex('hello.world') // → 'hello\\.world'
|
|
587
|
-
* escapeRegex('[test]') // → '\\[test\\]'
|
|
588
|
-
* escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
|
|
589
|
-
*/
|
|
590
|
-
const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
591
|
-
const getEquivClass = (ch) => {
|
|
592
|
-
const group = EQUIV_GROUPS.find((g) => g.includes(ch));
|
|
593
|
-
return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
|
|
594
|
-
};
|
|
595
|
-
const normalizeArabicLight = (str) => {
|
|
596
|
-
return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
|
|
597
|
-
};
|
|
598
|
-
/**
|
|
599
|
-
* Normalizes Arabic text for exact comparisons while tolerating common variants.
|
|
600
|
-
*
|
|
601
|
-
* This removes Arabic diacritics, collapses whitespace, removes joiners, and
|
|
602
|
-
* maps common equivalent letters to a shared canonical form:
|
|
603
|
-
* - ا/آ/أ/إ -> ا
|
|
604
|
-
* - ة/ه -> ه
|
|
605
|
-
* - ى/ي -> ي
|
|
606
|
-
*/
|
|
607
|
-
const normalizeArabicForComparison = (text) => {
|
|
608
|
-
return Array.from(normalizeArabicLight(text).replace(DIACRITICS_AND_MARKS_REGEX, "")).map((ch) => {
|
|
609
|
-
if (ch === "آ" || ch === "أ" || ch === "إ") return "ا";
|
|
610
|
-
if (ch === "ة") return "ه";
|
|
611
|
-
if (ch === "ى") return "ي";
|
|
612
|
-
return ch;
|
|
613
|
-
}).join("");
|
|
614
|
-
};
|
|
615
|
-
const makeDiacriticInsensitive = (text) => {
|
|
616
|
-
const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
|
|
617
|
-
return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
|
|
618
|
-
};
|
|
619
|
-
const isCombiningMarkOrSelector = (char) => {
|
|
620
|
-
if (!char) return false;
|
|
621
|
-
return /\p{M}/u.test(char) || char === "︎" || char === "️";
|
|
622
|
-
};
|
|
623
|
-
const isJoiner = (char) => char === "" || char === "";
|
|
624
|
-
/**
|
|
625
|
-
* Ensures the position does not split a grapheme cluster (surrogate pairs,
|
|
626
|
-
* combining marks, or zero-width joiners / variation selectors).
|
|
627
|
-
*
|
|
628
|
-
* This is only used as a last-resort fallback when we are forced to split
|
|
629
|
-
* near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
|
|
630
|
-
*/
|
|
631
|
-
const adjustForUnicodeBoundary = (content, position) => {
|
|
632
|
-
let adjusted = position;
|
|
633
|
-
while (adjusted > 0) {
|
|
634
|
-
const high = content.charCodeAt(adjusted - 1);
|
|
635
|
-
const low = content.charCodeAt(adjusted);
|
|
636
|
-
if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
|
|
637
|
-
adjusted -= 1;
|
|
638
|
-
continue;
|
|
639
|
-
}
|
|
640
|
-
const nextChar = content[adjusted];
|
|
641
|
-
const prevChar = content[adjusted - 1];
|
|
642
|
-
if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
|
|
643
|
-
adjusted -= 1;
|
|
644
|
-
continue;
|
|
645
|
-
}
|
|
646
|
-
break;
|
|
647
|
-
}
|
|
648
|
-
return adjusted;
|
|
649
|
-
};
|
|
650
|
-
//#endregion
|
|
651
|
-
//#region src/analysis/shared.ts
|
|
652
|
-
const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
|
|
653
|
-
const TOKEN_PRIORITY_ORDER$1 = [
|
|
654
|
-
"basmalah",
|
|
655
|
-
"kitab",
|
|
656
|
-
"bab",
|
|
657
|
-
"fasl",
|
|
658
|
-
"naql",
|
|
659
|
-
"rumuz",
|
|
660
|
-
"numbered",
|
|
661
|
-
"raqms",
|
|
662
|
-
"raqm",
|
|
663
|
-
"dash",
|
|
664
|
-
"bullet",
|
|
665
|
-
"tarqim"
|
|
666
|
-
];
|
|
667
|
-
const buildTokenPriority = () => {
|
|
668
|
-
const allTokens = new Set(getAvailableTokens());
|
|
669
|
-
return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
|
|
670
|
-
};
|
|
671
|
-
const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
|
|
672
|
-
const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED]/gu, "");
|
|
673
|
-
const compileTokenRegexes = (tokenNames) => tokenNames.map((token) => {
|
|
674
|
-
const pat = TOKEN_PATTERNS[token];
|
|
675
|
-
if (!pat) return null;
|
|
676
|
-
try {
|
|
677
|
-
return {
|
|
678
|
-
re: new RegExp(pat, "uy"),
|
|
679
|
-
token
|
|
680
|
-
};
|
|
681
|
-
} catch {
|
|
682
|
-
return null;
|
|
683
|
-
}
|
|
684
|
-
}).filter((x) => x !== null);
|
|
685
|
-
const appendWs = (out, mode) => {
|
|
686
|
-
if (!out) return out;
|
|
687
|
-
const suffix = mode === "space" ? " " : "\\s*";
|
|
688
|
-
return out.endsWith(suffix) ? out : `${out}${suffix}`;
|
|
689
|
-
};
|
|
690
|
-
const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
|
|
691
|
-
let best = null;
|
|
692
|
-
for (const { token, re } of compiled) {
|
|
693
|
-
re.lastIndex = pos;
|
|
694
|
-
const m = re.exec(s);
|
|
695
|
-
if (!m || m.index !== pos) continue;
|
|
696
|
-
if (!best || m[0].length > best.text.length) best = {
|
|
697
|
-
text: m[0],
|
|
698
|
-
token
|
|
699
|
-
};
|
|
700
|
-
}
|
|
701
|
-
if (best?.token === "rumuz") {
|
|
702
|
-
const end = pos + best.text.length;
|
|
703
|
-
const next = end < s.length ? s[end] : "";
|
|
704
|
-
if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
|
|
705
|
-
}
|
|
706
|
-
return best;
|
|
707
|
-
};
|
|
708
|
-
const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
|
|
709
|
-
const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
|
|
710
|
-
//#endregion
|
|
711
|
-
//#region src/analysis/line-starts.ts
|
|
712
|
-
const resolveOptions$1 = (options = {}) => ({
|
|
713
|
-
includeFirstWordFallback: options.includeFirstWordFallback ?? true,
|
|
714
|
-
lineFilter: options.lineFilter,
|
|
715
|
-
maxExamples: options.maxExamples ?? 1,
|
|
716
|
-
minCount: options.minCount ?? 3,
|
|
717
|
-
minLineLength: options.minLineLength ?? 6,
|
|
718
|
-
normalizeArabicDiacritics: options.normalizeArabicDiacritics ?? true,
|
|
719
|
-
prefixChars: options.prefixChars ?? 60,
|
|
720
|
-
prefixMatchers: options.prefixMatchers ?? [/^#+/u],
|
|
721
|
-
sortBy: options.sortBy ?? "specificity",
|
|
722
|
-
topK: options.topK ?? 40,
|
|
723
|
-
whitespace: options.whitespace ?? "regex"
|
|
724
|
-
});
|
|
725
|
-
const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
|
|
726
|
-
const computeSpecificity = (pattern) => ({
|
|
727
|
-
literalLen: pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "").length,
|
|
728
|
-
tokenCount: countTokenMarkers(pattern)
|
|
729
|
-
});
|
|
730
|
-
const compareBySpecificity = (a, b) => {
|
|
731
|
-
const sa = computeSpecificity(a.pattern), sb = computeSpecificity(b.pattern);
|
|
732
|
-
return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
|
|
733
|
-
};
|
|
734
|
-
const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
|
|
735
|
-
const appendPrefix = (s, pos, out, matchers, ws) => {
|
|
736
|
-
for (const re of matchers) {
|
|
737
|
-
if (pos >= s.length) break;
|
|
738
|
-
const m = re.exec(s.slice(pos));
|
|
739
|
-
if (!m?.index && m?.[0]) {
|
|
740
|
-
out += escapeSignatureLiteral(m[0]);
|
|
741
|
-
pos += m[0].length;
|
|
742
|
-
const wsm = /^[ \t]+/u.exec(s.slice(pos));
|
|
743
|
-
if (wsm) {
|
|
744
|
-
pos += wsm[0].length;
|
|
745
|
-
out = appendWs(out, ws);
|
|
746
|
-
}
|
|
747
|
-
return {
|
|
748
|
-
matched: true,
|
|
749
|
-
out,
|
|
750
|
-
pos
|
|
751
|
-
};
|
|
752
|
-
}
|
|
753
|
-
}
|
|
754
|
-
return {
|
|
755
|
-
matched: false,
|
|
756
|
-
out,
|
|
757
|
-
pos
|
|
758
|
-
};
|
|
759
|
-
};
|
|
760
|
-
const appendToken = (s, pos, out, compiled) => {
|
|
761
|
-
const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
|
|
762
|
-
return best ? {
|
|
763
|
-
matched: true,
|
|
764
|
-
out: `${out}{{${best.token}}}`,
|
|
765
|
-
pos: pos + best.text.length
|
|
766
|
-
} : {
|
|
767
|
-
matched: false,
|
|
768
|
-
out,
|
|
769
|
-
pos
|
|
770
|
-
};
|
|
771
|
-
};
|
|
772
|
-
const appendDelimiter = (s, pos, out) => {
|
|
773
|
-
const ch = s[pos];
|
|
774
|
-
return ch && isCommonDelimiter(ch) ? {
|
|
775
|
-
matched: true,
|
|
776
|
-
out: `${out}${escapeSignatureLiteral(ch)}`,
|
|
777
|
-
pos: pos + 1
|
|
778
|
-
} : {
|
|
779
|
-
matched: false,
|
|
780
|
-
out,
|
|
781
|
-
pos
|
|
782
|
-
};
|
|
783
|
-
};
|
|
784
|
-
const appendFallbackWord = (s, pos, out) => {
|
|
785
|
-
const word = extractFirstWord(s.slice(pos));
|
|
786
|
-
return word ? `${out}${escapeSignatureLiteral(word)}` : null;
|
|
787
|
-
};
|
|
788
|
-
const consumeLineStartStep = (s, pos, out, compiled, opts, matchedAny, matchedToken) => {
|
|
789
|
-
const ws = skipWhitespace$1(s, pos, out, opts.whitespace);
|
|
790
|
-
if (ws.skipped) return {
|
|
791
|
-
done: false,
|
|
792
|
-
matchedAny,
|
|
793
|
-
matchedToken,
|
|
794
|
-
out: ws.out,
|
|
795
|
-
pos: ws.pos,
|
|
796
|
-
steps: 0
|
|
797
|
-
};
|
|
798
|
-
const tok = appendToken(s, pos, out, compiled);
|
|
799
|
-
if (tok.matched) return {
|
|
800
|
-
done: false,
|
|
801
|
-
matchedAny: true,
|
|
802
|
-
matchedToken: true,
|
|
803
|
-
out: tok.out,
|
|
804
|
-
pos: tok.pos,
|
|
805
|
-
steps: 1
|
|
806
|
-
};
|
|
807
|
-
if (matchedAny) {
|
|
808
|
-
const delim = appendDelimiter(s, pos, out);
|
|
809
|
-
if (delim.matched) return {
|
|
810
|
-
done: false,
|
|
811
|
-
matchedAny,
|
|
812
|
-
matchedToken,
|
|
813
|
-
out: delim.out,
|
|
814
|
-
pos: delim.pos,
|
|
815
|
-
steps: 0
|
|
816
|
-
};
|
|
817
|
-
if (opts.includeFirstWordFallback && !matchedToken) {
|
|
818
|
-
const fallback = appendFallbackWord(s, pos, out);
|
|
819
|
-
if (fallback) return {
|
|
820
|
-
done: true,
|
|
821
|
-
matchedAny,
|
|
822
|
-
matchedToken,
|
|
823
|
-
out: fallback,
|
|
824
|
-
pos,
|
|
825
|
-
steps: 1
|
|
826
|
-
};
|
|
827
|
-
}
|
|
828
|
-
return {
|
|
829
|
-
done: true,
|
|
830
|
-
matchedAny,
|
|
831
|
-
matchedToken,
|
|
832
|
-
out,
|
|
833
|
-
pos,
|
|
834
|
-
steps: 0
|
|
835
|
-
};
|
|
836
|
-
}
|
|
837
|
-
if (!opts.includeFirstWordFallback) return {
|
|
838
|
-
done: true,
|
|
839
|
-
matchedAny,
|
|
840
|
-
matchedToken,
|
|
841
|
-
out,
|
|
842
|
-
pos,
|
|
843
|
-
steps: 0
|
|
844
|
-
};
|
|
845
|
-
const fallback = appendFallbackWord(s, pos, out);
|
|
846
|
-
return fallback ? {
|
|
847
|
-
done: true,
|
|
848
|
-
matchedAny: true,
|
|
849
|
-
matchedToken,
|
|
850
|
-
out: fallback,
|
|
851
|
-
pos,
|
|
852
|
-
steps: 0
|
|
853
|
-
} : {
|
|
854
|
-
done: true,
|
|
855
|
-
matchedAny,
|
|
856
|
-
matchedToken,
|
|
857
|
-
out,
|
|
858
|
-
pos,
|
|
859
|
-
steps: 0
|
|
860
|
-
};
|
|
861
|
-
};
|
|
862
|
-
/** Remove trailing whitespace placeholders */
|
|
863
|
-
const trimTrailingWs = (out, mode) => {
|
|
864
|
-
const suffix = mode === "regex" ? "\\s*" : " ";
|
|
865
|
-
while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
|
|
866
|
-
return out;
|
|
867
|
-
};
|
|
868
|
-
/** Try to extract first word for fallback */
|
|
869
|
-
const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
|
|
870
|
-
/** Skip whitespace at position */
|
|
871
|
-
const skipWhitespace$1 = (s, pos, out, ws) => {
|
|
872
|
-
const m = /^[ \t]+/u.exec(s.slice(pos));
|
|
873
|
-
if (!m) return {
|
|
874
|
-
out,
|
|
875
|
-
pos,
|
|
876
|
-
skipped: false
|
|
877
|
-
};
|
|
878
|
-
return {
|
|
879
|
-
out: appendWs(out, ws),
|
|
880
|
-
pos: pos + m[0].length,
|
|
881
|
-
skipped: true
|
|
882
|
-
};
|
|
883
|
-
};
|
|
884
|
-
const tokenizeLineStart = (line, tokenNames, opts) => {
|
|
885
|
-
const trimmed = collapseWhitespace(line);
|
|
886
|
-
if (!trimmed) return null;
|
|
887
|
-
const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
|
|
888
|
-
const compiled = compileTokenRegexes(tokenNames);
|
|
889
|
-
let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
|
|
890
|
-
const prefix = appendPrefix(s, pos, out, opts.prefixMatchers, opts.whitespace);
|
|
891
|
-
pos = prefix.pos;
|
|
892
|
-
out = prefix.out;
|
|
893
|
-
matchedAny = prefix.matched;
|
|
894
|
-
while (steps < 6 && pos < s.length) {
|
|
895
|
-
const next = consumeLineStartStep(s, pos, out, compiled, opts, matchedAny, matchedToken);
|
|
896
|
-
if (next.done) {
|
|
897
|
-
if (!next.matchedAny && !next.matchedToken && next.out === out && next.pos === pos) return null;
|
|
898
|
-
if (next.steps > 0) steps += next.steps;
|
|
899
|
-
matchedAny = next.matchedAny;
|
|
900
|
-
matchedToken = next.matchedToken;
|
|
901
|
-
out = next.out;
|
|
902
|
-
break;
|
|
903
|
-
}
|
|
904
|
-
pos = next.pos;
|
|
905
|
-
out = next.out;
|
|
906
|
-
matchedAny = next.matchedAny;
|
|
907
|
-
matchedToken = next.matchedToken;
|
|
908
|
-
steps += next.steps;
|
|
909
|
-
}
|
|
910
|
-
return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
|
|
911
|
-
};
|
|
912
|
-
const processLine = (line, pageId, tokenPriority, opts, acc) => {
|
|
913
|
-
const trimmed = collapseWhitespace(line);
|
|
914
|
-
if (trimmed.length < opts.minLineLength) return;
|
|
915
|
-
if (opts.lineFilter && !opts.lineFilter(trimmed, pageId)) return;
|
|
916
|
-
const sig = tokenizeLineStart(trimmed, tokenPriority, opts);
|
|
917
|
-
if (!sig) return;
|
|
918
|
-
const entry = acc.get(sig);
|
|
919
|
-
if (!entry) acc.set(sig, {
|
|
920
|
-
count: 1,
|
|
921
|
-
examples: [{
|
|
922
|
-
line: trimmed,
|
|
923
|
-
pageId
|
|
924
|
-
}]
|
|
925
|
-
});
|
|
926
|
-
else {
|
|
927
|
-
entry.count++;
|
|
928
|
-
if (entry.examples.length < opts.maxExamples) entry.examples.push({
|
|
929
|
-
line: trimmed,
|
|
930
|
-
pageId
|
|
931
|
-
});
|
|
932
|
-
}
|
|
933
|
-
};
|
|
934
|
-
const processPage = (page, tokenPriority, opts, acc) => {
|
|
935
|
-
for (const line of normalizeLineEndings(page.content ?? "").split("\n")) processLine(line, page.id, tokenPriority, opts, acc);
|
|
936
|
-
};
|
|
937
|
-
/**
|
|
938
|
-
* Analyze pages and return the most common line-start patterns (top K).
|
|
939
|
-
*/
|
|
940
|
-
const analyzeCommonLineStarts = (pages, options = {}) => {
|
|
941
|
-
const opts = resolveOptions$1(options);
|
|
942
|
-
const tokenPriority = buildTokenPriority();
|
|
943
|
-
const acc = /* @__PURE__ */ new Map();
|
|
944
|
-
for (const page of pages) processPage(page, tokenPriority, opts, acc);
|
|
945
|
-
const comparator = opts.sortBy === "count" ? compareByCount : compareBySpecificity;
|
|
946
|
-
return [...acc.entries()].map(([pattern, v]) => ({
|
|
947
|
-
count: v.count,
|
|
948
|
-
examples: v.examples,
|
|
949
|
-
pattern
|
|
950
|
-
})).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
|
|
951
|
-
};
|
|
952
|
-
//#endregion
|
|
953
|
-
//#region src/analysis/repeating-sequences.ts
|
|
954
|
-
const resolveOptions = (options) => {
|
|
955
|
-
const minElements = Math.max(1, options?.minElements ?? 1);
|
|
956
|
-
return {
|
|
957
|
-
contextChars: options?.contextChars ?? 50,
|
|
958
|
-
maxElements: Math.max(minElements, options?.maxElements ?? 3),
|
|
959
|
-
maxExamples: options?.maxExamples ?? 3,
|
|
960
|
-
maxUniquePatterns: options?.maxUniquePatterns ?? 1e3,
|
|
961
|
-
minCount: Math.max(1, options?.minCount ?? 3),
|
|
962
|
-
minElements,
|
|
963
|
-
normalizeArabicDiacritics: options?.normalizeArabicDiacritics ?? true,
|
|
964
|
-
requireToken: options?.requireToken ?? true,
|
|
965
|
-
topK: Math.max(1, options?.topK ?? 20),
|
|
966
|
-
whitespace: options?.whitespace ?? "regex"
|
|
967
|
-
};
|
|
968
|
-
};
|
|
969
|
-
/** Creates a cursor that tracks position in both normalized and raw text */
|
|
970
|
-
const createRawCursor = (text, normalize) => {
|
|
971
|
-
let rawPos = 0;
|
|
972
|
-
return {
|
|
973
|
-
/** Advance cursor, returning the raw text chunk consumed */
|
|
974
|
-
advance(normalizedLen) {
|
|
975
|
-
if (!normalize) {
|
|
976
|
-
const chunk = text.slice(rawPos, rawPos + normalizedLen);
|
|
977
|
-
rawPos += normalizedLen;
|
|
978
|
-
return chunk;
|
|
979
|
-
}
|
|
980
|
-
const start = rawPos;
|
|
981
|
-
let matchedLen = 0;
|
|
982
|
-
while (matchedLen < normalizedLen && rawPos < text.length) {
|
|
983
|
-
if (stripArabicDiacritics(text[rawPos]).length > 0) matchedLen++;
|
|
984
|
-
rawPos++;
|
|
985
|
-
}
|
|
986
|
-
while (rawPos < text.length && stripArabicDiacritics(text[rawPos]).length === 0) rawPos++;
|
|
987
|
-
return text.slice(start, rawPos);
|
|
988
|
-
},
|
|
989
|
-
get pos() {
|
|
990
|
-
return rawPos;
|
|
991
|
-
}
|
|
992
|
-
};
|
|
993
|
-
};
|
|
994
|
-
/** Scans text and produces a stream of tokens and literals. */
|
|
995
|
-
const tokenizeContent = (text, normalize) => {
|
|
996
|
-
const normalized = normalize ? stripArabicDiacritics(text) : text;
|
|
997
|
-
const compiled = compileTokenRegexes(buildTokenPriority());
|
|
998
|
-
const cursor = createRawCursor(text, normalize);
|
|
999
|
-
const items = [];
|
|
1000
|
-
let pos = 0;
|
|
1001
|
-
while (pos < normalized.length) {
|
|
1002
|
-
const ws = /^\s+/u.exec(normalized.slice(pos));
|
|
1003
|
-
if (ws) {
|
|
1004
|
-
pos += ws[0].length;
|
|
1005
|
-
cursor.advance(ws[0].length);
|
|
1006
|
-
continue;
|
|
1007
|
-
}
|
|
1008
|
-
const token = findBestTokenMatchAt(normalized, pos, compiled, isArabicLetter);
|
|
1009
|
-
if (token) {
|
|
1010
|
-
const raw = cursor.advance(token.text.length);
|
|
1011
|
-
items.push({
|
|
1012
|
-
end: cursor.pos,
|
|
1013
|
-
raw,
|
|
1014
|
-
start: cursor.pos - raw.length,
|
|
1015
|
-
text: `{{${token.token}}}`,
|
|
1016
|
-
type: "token"
|
|
1017
|
-
});
|
|
1018
|
-
pos += token.text.length;
|
|
1019
|
-
continue;
|
|
1020
|
-
}
|
|
1021
|
-
if (isCommonDelimiter(normalized[pos])) {
|
|
1022
|
-
const raw = cursor.advance(1);
|
|
1023
|
-
items.push({
|
|
1024
|
-
end: cursor.pos,
|
|
1025
|
-
raw,
|
|
1026
|
-
start: cursor.pos - 1,
|
|
1027
|
-
text: escapeSignatureLiteral(normalized[pos]),
|
|
1028
|
-
type: "literal"
|
|
1029
|
-
});
|
|
1030
|
-
pos++;
|
|
1031
|
-
continue;
|
|
1032
|
-
}
|
|
1033
|
-
const word = /^[^\s::\-–—ـ،؛.?!؟()[\]{}]+/u.exec(normalized.slice(pos));
|
|
1034
|
-
if (word) {
|
|
1035
|
-
const raw = cursor.advance(word[0].length);
|
|
1036
|
-
items.push({
|
|
1037
|
-
end: cursor.pos,
|
|
1038
|
-
raw,
|
|
1039
|
-
start: cursor.pos - raw.length,
|
|
1040
|
-
text: escapeSignatureLiteral(word[0]),
|
|
1041
|
-
type: "literal"
|
|
1042
|
-
});
|
|
1043
|
-
pos += word[0].length;
|
|
1044
|
-
continue;
|
|
1045
|
-
}
|
|
1046
|
-
cursor.advance(1);
|
|
1047
|
-
pos++;
|
|
1048
|
-
}
|
|
1049
|
-
return items;
|
|
1050
|
-
};
|
|
1051
|
-
/** Build pattern string from window items */
|
|
1052
|
-
const buildPattern = (window, whitespace) => window.map((i) => i.text).join(whitespace === "space" ? " " : "\\s*");
|
|
1053
|
-
/** Check if window contains at least one token */
|
|
1054
|
-
const hasTokenInWindow = (window) => window.some((i) => i.type === "token");
|
|
1055
|
-
/** Compute token count and literal length for a window */
|
|
1056
|
-
const computeWindowStats = (window) => {
|
|
1057
|
-
let tokenCount = 0, literalLen = 0;
|
|
1058
|
-
for (const item of window) if (item.type === "token") tokenCount++;
|
|
1059
|
-
else literalLen += item.text.length;
|
|
1060
|
-
return {
|
|
1061
|
-
literalLen,
|
|
1062
|
-
tokenCount
|
|
1063
|
-
};
|
|
1064
|
-
};
|
|
1065
|
-
/** Build example from page content and window */
|
|
1066
|
-
const buildExample = (page, window, contextChars) => {
|
|
1067
|
-
const start = window[0].start;
|
|
1068
|
-
const end = window.at(-1).end;
|
|
1069
|
-
const ctxStart = Math.max(0, start - contextChars);
|
|
1070
|
-
const ctxEnd = Math.min(page.content.length, end + contextChars);
|
|
1071
|
-
return {
|
|
1072
|
-
context: (ctxStart > 0 ? "..." : "") + page.content.slice(ctxStart, ctxEnd) + (ctxEnd < page.content.length ? "..." : ""),
|
|
1073
|
-
pageId: page.id,
|
|
1074
|
-
startIndices: window.map((w) => w.start),
|
|
1075
|
-
text: page.content.slice(start, end)
|
|
1076
|
-
};
|
|
1077
|
-
};
|
|
1078
|
-
const recordPattern = (page, window, opts, stats) => {
|
|
1079
|
-
if (opts.requireToken && !hasTokenInWindow(window)) return;
|
|
1080
|
-
const pattern = buildPattern(window, opts.whitespace);
|
|
1081
|
-
let entry = stats.get(pattern);
|
|
1082
|
-
if (!entry) {
|
|
1083
|
-
if (stats.size >= opts.maxUniquePatterns) return;
|
|
1084
|
-
entry = {
|
|
1085
|
-
count: 0,
|
|
1086
|
-
examples: [],
|
|
1087
|
-
...computeWindowStats(window)
|
|
1088
|
-
};
|
|
1089
|
-
stats.set(pattern, entry);
|
|
1090
|
-
}
|
|
1091
|
-
entry.count++;
|
|
1092
|
-
if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
|
|
1093
|
-
};
|
|
1094
|
-
/** Extract N-grams from a single page */
|
|
1095
|
-
const extractPageNgrams = (page, items, opts, stats) => {
|
|
1096
|
-
for (let i = 0; i <= items.length - opts.minElements; i++) {
|
|
1097
|
-
const maxWindowSize = Math.min(opts.maxElements, items.length - i);
|
|
1098
|
-
for (let n = opts.minElements; n <= maxWindowSize; n++) recordPattern(page, items.slice(i, i + n), opts, stats);
|
|
1099
|
-
}
|
|
1100
|
-
};
|
|
1101
|
-
/**
|
|
1102
|
-
* Analyze pages for commonly repeating word sequences.
|
|
1103
|
-
*
|
|
1104
|
-
* Use for continuous text without line breaks. For line-based analysis,
|
|
1105
|
-
* use `analyzeCommonLineStarts()` instead.
|
|
1106
|
-
*/
|
|
1107
|
-
const analyzeRepeatingSequences = (pages, options) => {
|
|
1108
|
-
const opts = resolveOptions(options);
|
|
1109
|
-
const stats = /* @__PURE__ */ new Map();
|
|
1110
|
-
for (const page of pages) {
|
|
1111
|
-
if (!page.content) continue;
|
|
1112
|
-
extractPageNgrams(page, tokenizeContent(page.content, opts.normalizeArabicDiacritics), opts, stats);
|
|
1113
|
-
}
|
|
1114
|
-
return [...stats.entries()].filter(([, s]) => s.count >= opts.minCount).sort((a, b) => b[1].count - a[1].count || b[1].tokenCount - a[1].tokenCount || b[1].literalLen - a[1].literalLen).slice(0, opts.topK).map(([pattern, s]) => ({
|
|
1115
|
-
count: s.count,
|
|
1116
|
-
examples: s.examples,
|
|
1117
|
-
pattern
|
|
1118
|
-
}));
|
|
1119
|
-
};
|
|
1120
|
-
//#endregion
|
|
1
|
+
import { A as ARABIC_MARKS_CLASS, B as getTokenPattern, C as analyzeCommonLineStarts, D as normalizeArabicForComparison, E as makeDiacriticInsensitive, F as containsTokens, H as stripTokenMappings, I as expandCompositeTokensInTemplate, L as expandTokens, M as TOKEN_PATTERNS, N as Token, O as ARABIC_BASE_LETTER_CLASS, P as applyTokenMappings, R as expandTokensWithCaptures, S as analyzeRepeatingSequences, T as escapeTemplateBrackets, U as templateToRegex, V as shouldDefaultToFuzzy, W as withCapture, _ as removeZeroWidth, a as diagnoseDictionaryProfile, b as optimizeRules, c as analyzeDictionaryMarkdownPages, d as getDebugReason, f as getSegmentDebugReason, g as fixTrailingWaw, h as condenseEllipsis, i as escapeWordsOutsideTokens, j as ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, k as ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, l as classifyDictionaryHeading, m as applyPreprocessToPage, n as segmentPages, o as DictionaryProfileValidationError, p as validateSegments, r as createArabicDictionaryEntryRule, s as validateDictionaryProfile, t as suggestSegmentationOptions, u as scanDictionaryMarkdownPage, v as formatValidationReport, w as escapeRegex, x as PATTERN_TYPE_KEYS, y as validateRules, z as getAvailableTokens } from "./segmentation-advisor-D375TL8-.mjs";
|
|
1121
2
|
//#region src/detection.ts
|
|
1122
3
|
/**
|
|
1123
4
|
* Token detection order - more specific patterns first to avoid partial matches.
|
|
@@ -1279,3330 +160,6 @@ const analyzeTextForRule = (text) => {
|
|
|
1279
160
|
};
|
|
1280
161
|
};
|
|
1281
162
|
//#endregion
|
|
1282
|
-
|
|
1283
|
-
/**
|
|
1284
|
-
* Pattern type key names for split rules.
|
|
1285
|
-
*
|
|
1286
|
-
* Use this array to dynamically iterate over pattern types in UIs,
|
|
1287
|
-
* or use the `PatternTypeKey` type for type-safe string unions.
|
|
1288
|
-
*
|
|
1289
|
-
* @example
|
|
1290
|
-
* // Build a dropdown/select in UI
|
|
1291
|
-
* PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
|
|
1292
|
-
*
|
|
1293
|
-
* @example
|
|
1294
|
-
* // Type-safe pattern key validation
|
|
1295
|
-
* const validateKey = (k: string): k is PatternTypeKey =>
|
|
1296
|
-
* (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
|
|
1297
|
-
*/
|
|
1298
|
-
const PATTERN_TYPE_KEYS = [
|
|
1299
|
-
"lineStartsWith",
|
|
1300
|
-
"lineStartsAfter",
|
|
1301
|
-
"lineEndsWith",
|
|
1302
|
-
"template",
|
|
1303
|
-
"regex",
|
|
1304
|
-
"dictionaryEntry"
|
|
1305
|
-
];
|
|
1306
|
-
//#endregion
|
|
1307
|
-
//#region src/optimization/optimize-rules.ts
|
|
1308
|
-
const MERGEABLE_KEYS = new Set([
|
|
1309
|
-
"lineStartsWith",
|
|
1310
|
-
"lineStartsAfter",
|
|
1311
|
-
"lineEndsWith"
|
|
1312
|
-
]);
|
|
1313
|
-
/**
|
|
1314
|
-
* Get the pattern type key for a rule.
|
|
1315
|
-
*/
|
|
1316
|
-
const getPatternKey = (rule) => PATTERN_TYPE_KEYS.find((key) => key in rule) ?? "regex";
|
|
1317
|
-
const getPatternArray = (rule, key) => {
|
|
1318
|
-
const value = rule[key];
|
|
1319
|
-
return Array.isArray(value) ? value : [];
|
|
1320
|
-
};
|
|
1321
|
-
const getPatternString = (rule, key) => {
|
|
1322
|
-
const value = rule[key];
|
|
1323
|
-
return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : value ? JSON.stringify(value) : "";
|
|
1324
|
-
};
|
|
1325
|
-
const normalizePatterns = (patterns) => [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
|
|
1326
|
-
const getDictionaryEntrySpecificityScore = (rule) => {
|
|
1327
|
-
if (!("dictionaryEntry" in rule)) return 0;
|
|
1328
|
-
const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = rule.dictionaryEntry;
|
|
1329
|
-
return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWords.length, 25);
|
|
1330
|
-
};
|
|
1331
|
-
const getSpecificityScore = (rule) => {
|
|
1332
|
-
const key = getPatternKey(rule);
|
|
1333
|
-
if (key === "dictionaryEntry") return getDictionaryEntrySpecificityScore(rule);
|
|
1334
|
-
return MERGEABLE_KEYS.has(key) ? getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0) : getPatternString(rule, key).length;
|
|
1335
|
-
};
|
|
1336
|
-
const createMergeKey = (rule) => {
|
|
1337
|
-
const key = getPatternKey(rule);
|
|
1338
|
-
const { [key]: _, ...rest } = rule;
|
|
1339
|
-
return `${key}|${JSON.stringify(rest)}`;
|
|
1340
|
-
};
|
|
1341
|
-
const optimizeRules = (rules) => {
|
|
1342
|
-
const output = [];
|
|
1343
|
-
const indexByMergeKey = /* @__PURE__ */ new Map();
|
|
1344
|
-
let mergedCount = 0;
|
|
1345
|
-
for (const rule of rules) {
|
|
1346
|
-
const key = getPatternKey(rule);
|
|
1347
|
-
if (!MERGEABLE_KEYS.has(key)) {
|
|
1348
|
-
output.push(rule);
|
|
1349
|
-
continue;
|
|
1350
|
-
}
|
|
1351
|
-
const mergeKey = createMergeKey(rule);
|
|
1352
|
-
const existingIndex = indexByMergeKey.get(mergeKey);
|
|
1353
|
-
if (existingIndex === void 0) {
|
|
1354
|
-
indexByMergeKey.set(mergeKey, output.length);
|
|
1355
|
-
output.push({
|
|
1356
|
-
...rule,
|
|
1357
|
-
[key]: normalizePatterns(getPatternArray(rule, key))
|
|
1358
|
-
});
|
|
1359
|
-
} else {
|
|
1360
|
-
const existing = output[existingIndex];
|
|
1361
|
-
existing[key] = normalizePatterns([...getPatternArray(existing, key), ...getPatternArray(rule, key)]);
|
|
1362
|
-
mergedCount++;
|
|
1363
|
-
}
|
|
1364
|
-
}
|
|
1365
|
-
return {
|
|
1366
|
-
mergedCount,
|
|
1367
|
-
rules: output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a))
|
|
1368
|
-
};
|
|
1369
|
-
};
|
|
1370
|
-
//#endregion
//#region src/preprocessing/transforms.ts
/** Helper for exhaustive switch checking - TypeScript will error if a case is missed */
const assertNever = (x) => {
throw new Error(`Unknown preprocess transform type: ${JSON.stringify(x)}`);
};
/** Check if a character is whitespace (space, newline, tab, etc.) */
const isWhitespace = (char) => /\s/.test(char);
/**
* Check if a character code is a zero-width control character.
*
* Covers:
* - U+200B–U+200F (Zero Width Space, Joiners, Direction Marks)
* - U+202A–U+202E (Bidirectional Formatting)
* - U+2060–U+2064 (Word Joiner, Invisible Operators)
* - U+FEFF (Byte Order Mark / Zero Width No-Break Space)
*/
const isZeroWidth = (code) => code >= 8203 && code <= 8207 || code >= 8234 && code <= 8238 || code >= 8288 && code <= 8292 || code === 65279;
/**
* Remove zero-width control characters from text.
*
* @param text - Input text
* @param mode - 'strip' (default) removes entirely, 'space' replaces with space
* @returns Text with zero-width characters removed or replaced
*/
const removeZeroWidth = (text, mode = "strip") => {
if (mode === "space") {
const parts = [];
let lastWasWhitespace = true;
for (let i = 0; i < text.length; i++) if (isZeroWidth(text.charCodeAt(i))) {
if (!lastWasWhitespace && parts.length > 0) {
parts.push(" ");
lastWasWhitespace = true;
}
} else {
const char = text[i];
parts.push(char);
lastWasWhitespace = isWhitespace(char);
}
return parts.join("");
}
return text.replace(/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/g, "");
};
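// --- Editor's example (not in the original diff): removeZeroWidth in both modes.
const zwSample = "ab\u200Bcd";
removeZeroWidth(zwSample); // → "abcd" (strip mode drops the character entirely)
removeZeroWidth(zwSample, "space"); // → "ab cd" (space mode inserts a single separator)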
/**
* Condense multiple periods (...) into ellipsis character (…).
*
* Prevents `{{tarqim}}` from false-matching inside ellipsis since
* the `.` in tarqim matches individual periods.
*
* @param text - Input text
* @returns Text with period sequences replaced by ellipsis
*/
const condenseEllipsis = (text) => text.replace(/\.{2,}/g, "…");
/**
* Join trailing و (waw) to the next word.
*
* Fixes OCR/digitization artifacts: ' و ' → ' و' (waw joined to next word)
*
* @param text - Input text
* @returns Text with trailing waw joined to following word
*/
const fixTrailingWaw = (text) => text.replace(/ و /g, " و");
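// --- Editor's example (not in the original diff): the two OCR-cleanup transforms above.
condenseEllipsis("قال..."); // → "قال…"
fixTrailingWaw("قال و قال"); // → "قال وقال"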
/**
* Check if a page ID is within a constraint range.
*/
const isInRange = (pageId, constraint) => {
if (constraint.min !== void 0 && pageId < constraint.min) return false;
if (constraint.max !== void 0 && pageId > constraint.max) return false;
return true;
};
/**
* Normalize a transform to its object form.
*/
const normalizeTransform = (transform) => {
if (typeof transform === "string") return { type: transform };
return transform;
};
/**
* Apply preprocessing transforms to a page's content.
*
* Transforms run in array order. Each can be limited to specific pages
* via `min`/`max` constraints.
*
* @param content - Page content to transform
* @param pageId - Page ID for constraint checking
* @param transforms - Array of transforms to apply
* @returns Transformed content
*/
const applyPreprocessToPage = (content, pageId, transforms) => {
let result = content;
for (const transform of transforms) {
const rule = normalizeTransform(transform);
if (!isInRange(pageId, rule)) continue;
switch (rule.type) {
case "removeZeroWidth":
result = removeZeroWidth(result, rule.mode ?? "strip");
break;
case "condenseEllipsis":
result = condenseEllipsis(result);
break;
case "fixTrailingWaw":
result = fixTrailingWaw(result);
break;
default: assertNever(rule.type);
}
}
return result;
};
//#endregion
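// --- Editor's sketch (not from the source): transforms run in array order, and a
// min/max constraint limits a transform to a page range.
applyPreprocessToPage("ب\u200Bاب...", 3, ["removeZeroWidth", { min: 5, type: "condenseEllipsis" }]);
// → "باب..." — the zero-width character is stripped, while condenseEllipsis is
// skipped because page 3 is below its min of 5.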
//#region src/segmentation/arabic-dictionary-rule.ts
const uniqueCanonicalWords = (words) => {
const seen = /* @__PURE__ */ new Set();
const result = [];
for (const word of words) {
const normalized = normalizeArabicForComparison(word);
if (!normalized || seen.has(normalized)) continue;
seen.add(normalized);
result.push(word);
}
return result;
};
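// --- Editor's illustration (assumption: normalizeArabicForComparison strips diacritics,
// so a vocalized and a bare spelling collapse to one canonical form):
uniqueCanonicalWords(["قَالَ", "قال", "ثم"]); // → ["قَالَ", "ثم"] — the first spelling wins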
const buildStopAlternation = (stopWords) => {
const unique = uniqueCanonicalWords(stopWords);
if (unique.length === 0) return "";
return unique.map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word))).join("|");
};
const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
};
const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
const withCapture = `(?<${captureName}>${headwordBody})`;
if (!allowParenthesized) return `${withCapture}${colon}`;
return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
};
const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
if (!captureName.match(/^[A-Za-z_]\w*$/)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
};
const buildArabicDictionaryEntryRegexSource = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords }, capturePrefix) => {
validateDictionaryEntryOptions({
captureName,
maxLetters,
minLetters
});
const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
const stopAlternation = buildStopAlternation(stopWords);
const lemmaBody = buildHeadwordBody({
allowCommaSeparated,
colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
stopAlternation,
stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
unit: lemmaUnit
});
const lineStartBoundary = `(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`;
const midLineTrigger = allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`;
const prefixedCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
const regex = `(?:${lineStartBoundary}${midLineSubentries ? `|${midLineTrigger}` : ""})` + buildBalancedMarker({
allowParenthesized,
allowWhitespaceBeforeColon,
captureName: prefixedCaptureName,
headwordBody: lemmaBody
});
return {
captureNames: [prefixedCaptureName],
regex
};
};
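// --- Editor's sketch (not from the source): the builder returns a regex source plus
// its capture names; compiling the source is left to the caller.
const { regex: lemmaSrc, captureNames: lemmaCaptures } = buildArabicDictionaryEntryRegexSource({ stopWords: ["وقيل"] });
const lemmaRe = new RegExp(lemmaSrc, "gmu"); // lemmaCaptures → ["lemma"]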
/**
* Creates a reusable split rule for Arabic dictionary entries.
*
* The returned rule preserves authoring intent as a serializable
* `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
* regex string.
*
* @example
* createArabicDictionaryEntryRule({
* stopWords: ['وقيل', 'ويقال', 'قال'],
* pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
* })
*
* @example
* createArabicDictionaryEntryRule({
* allowParenthesized: true,
* allowWhitespaceBeforeColon: true,
* allowCommaSeparated: true,
* stopWords: ['الليث', 'العجاج'],
* })
*/
const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
validateDictionaryEntryOptions({
captureName,
maxLetters,
minLetters
});
return {
dictionaryEntry: {
allowCommaSeparated,
allowParenthesized,
allowWhitespaceBeforeColon,
captureName,
maxLetters,
midLineSubentries,
minLetters,
stopWords: uniqueCanonicalWords(stopWords)
},
meta,
pageStartPrevWordStoplist,
samePagePrevWordStoplist
};
};
const WINDOW_PREFIX_LENGTHS = [
80,
60,
40,
30,
20,
15
];
const JOINER_PREFIX_LENGTHS = [
80,
60,
40,
30,
20,
15,
12,
10,
8,
6
];
const STOP_CHARACTERS = /[\s\n.,;!?؛،۔۞]/;
/**
* Maximum allowed deviation between expected and actual boundary positions (characters).
* Matches outside this range are rejected unless `ignoreDeviation` is active.
*/
const MAX_DEVIATION = 2e3;
//#endregion
//#region src/segmentation/match-utils.ts
/**
* Extracts named capture groups from a regex match.
*
* Only includes groups that are in the `captureNames` list and have
* defined values. This filters out positional captures and ensures
* only explicitly requested named captures are returned.
*
* @param groups - The `match.groups` object from `RegExp.exec()`
* @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
* @returns Object with capture name → value pairs, or `undefined` if none found
*
* @example
* const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
* extractNamedCaptures(match.groups, ['num'])
* // → { num: '٦٦٩٦' }
*
* @example
* // No matching captures
* extractNamedCaptures({}, ['num'])
* // → undefined
*
* @example
* // Undefined groups
* extractNamedCaptures(undefined, ['num'])
* // → undefined
*/
const extractNamedCaptures = (groups, captureNames) => {
if (!groups || captureNames.length === 0) return;
const namedCaptures = {};
for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
};
/**
* Gets the last defined positional capture group from a match array.
*
* Used for `lineStartsAfter` patterns where the content capture (`.*`)
* is always at the end of the pattern. Named captures may shift the
* positional indices, so we iterate backward to find the actual content.
*
* @param match - RegExp exec result array
* @returns The last defined capture group value, or `undefined` if none
*
* @example
* // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
* // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
* getLastPositionalCapture(match)
* // → 'content'
*
* @example
* // No captures
* getLastPositionalCapture(['full match'])
* // → undefined
*/
const getLastPositionalCapture = (match) => {
if (match.length <= 1) return;
for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
};
/**
* Filters matches to only include those within page ID constraints.
*
* Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
* matches that occur on pages outside the allowed range or explicitly excluded.
*
* @param matches - Array of match results to filter
* @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
* @param getId - Function that returns the page ID for a given offset
* @returns Filtered array containing only matches within constraints
*
* @example
* const matches = [
* { start: 0, end: 10 }, // Page 1
* { start: 100, end: 110 }, // Page 5
* { start: 200, end: 210 }, // Page 10
* ];
* filterByConstraints(matches, { min: 3, max: 8 }, getId)
* // → [{ start: 100, end: 110 }] (only page 5 match)
*/
const filterByConstraints = (matches, rule, getId) => matches.filter((m) => {
const id = getId(m.start);
return (rule.min === void 0 || id >= rule.min) && (rule.max === void 0 || id <= rule.max) && !isPageExcluded(id, rule.exclude);
});
/**
* Checks if any rule in the list allows the given page ID.
*
* A rule allows an ID if it falls within the rule's `min`/`max` constraints.
* Rules without constraints allow all page IDs.
*
* This is used to determine whether to create a segment for content
* that appears before any split points (the "first segment").
*
* @param rules - Array of rules with optional `min` and `max` constraints
* @param pageId - Page ID to check
* @returns `true` if at least one rule allows the page ID
*
* @example
* const rules = [
* { min: 5, max: 10 }, // Allows pages 5-10
* { min: 20 }, // Allows pages 20+
* ];
*
* anyRuleAllowsId(rules, 7) // → true (first rule allows)
* anyRuleAllowsId(rules, 3) // → false (no rule allows)
* anyRuleAllowsId(rules, 25) // → true (second rule allows)
*
* @example
* // Rules without constraints allow everything
* anyRuleAllowsId([{}], 999) // → true
*/
const anyRuleAllowsId = (rules, pageId) => rules.some((r) => (r.min === void 0 || pageId >= r.min) && (r.max === void 0 || pageId <= r.max));
const extractDebugIndex = (groups, prefix) => {
if (!groups) return;
for (const key in groups) if (key.startsWith(prefix) && groups[key] !== void 0) {
const idx = Number.parseInt(key.slice(prefix.length), 10);
if (!Number.isNaN(idx)) return idx;
}
};
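// --- Editor's example (not in the original diff): debug capture groups carry a
// numeric suffix, and this helper recovers that index.
extractDebugIndex({ _w3: "فهذا" }, "_w"); // → 3
extractDebugIndex({ other: "x" }, "_w"); // → undefined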
//#endregion
//#region src/segmentation/breakpoint-utils.ts
/**
* Escapes regex metacharacters outside of `{{token}}` delimiters.
*
* This allows words in the `words` field to contain tokens while treating
* most other characters as literals.
*
* Note: `()[]` are NOT escaped here because `processPattern` will handle them
* via `escapeTemplateBrackets`. This avoids double-escaping.
*
* @param word - Word string that may contain {{tokens}}
* @returns String with metacharacters escaped outside tokens (except ()[] which are escaped by processPattern)
*
* @example
* escapeWordsOutsideTokens('a.*b')
* // → 'a\\.\\*b'
*
* escapeWordsOutsideTokens('{{naql}}.test')
* // → '{{naql}}\\.test'
*
* escapeWordsOutsideTokens('(literal)')
* // → '(literal)' (not escaped here - processPattern handles it)
*/
const escapeWordsOutsideTokens = (word) => word.split(/(\{\{[^}]+\}\})/g).map((part) => part.startsWith("{{") && part.endsWith("}}") ? part : part.replace(/[.*+?^${}|\\]/g, "\\$&")).join("");
/**
* Normalizes a breakpoint to the object form.
* Strings are converted to { pattern: str, split: 'after' } with no constraints.
* Invalid `split` values are treated as `'after'` for backward compatibility.
* If both `pattern` and `regex` are specified, `regex` takes precedence.
*
* When `words` is specified:
* - Defaults `split` to `'at'` (can be overridden)
* - Throws if combined with `pattern` or `regex`
*
* @param bp - Breakpoint as string or object
* @returns Normalized BreakpointRule object with resolved pattern/regex
*
* @example
* normalizeBreakpoint('\\n\\n')
* // → { pattern: '\\n\\n', split: 'after' }
*
* normalizeBreakpoint({ pattern: '\\n', min: 10 })
* // → { pattern: '\\n', min: 10, split: 'after' }
*
* normalizeBreakpoint({ pattern: 'X', split: 'at' })
* // → { pattern: 'X', split: 'at' }
*
* normalizeBreakpoint({ words: ['فهذا', 'ثم'] })
* // → { words: ['فهذا', 'ثم'], split: 'at' }
*/
const normalizeBreakpoint = (bp) => {
if (typeof bp === "string") return {
pattern: bp,
split: "after"
};
if (bp.words && (bp.pattern !== void 0 || bp.regex !== void 0)) throw new Error("BreakpointRule: \"words\" cannot be combined with \"pattern\" or \"regex\"");
const defaultSplit = bp.words ? "at" : "after";
const split = bp.split === "at" || bp.split === "after" ? bp.split : defaultSplit;
return {
...bp,
split
};
};
/**
* Checks if a page ID is in an excluded list (single pages or ranges).
*
* @param pageId - Page ID to check
* @param excludeList - List of page IDs or [from, to] ranges to exclude
* @returns True if page is excluded
*
* @example
* isPageExcluded(5, [1, 5, 10])
* // → true
*
* isPageExcluded(5, [[3, 7]])
* // → true
*
* isPageExcluded(5, [[10, 20]])
* // → false
*/
const isPageExcluded = (pageId, excludeList) => excludeList?.some((item) => typeof item === "number" ? pageId === item : pageId >= item[0] && pageId <= item[1]) ?? false;
/**
* Checks if a page ID is within a breakpoint's min/max range and not excluded.
*
* @param pageId - Page ID to check
* @param rule - Breakpoint rule with optional min/max/exclude constraints
* @returns True if page is within valid range
*
* @example
* isInBreakpointRange(50, { pattern: '\\n', min: 10, max: 100 })
* // → true
*
* isInBreakpointRange(5, { pattern: '\\n', min: 10 })
* // → false (below min)
*/
const isInBreakpointRange = (pageId, rule) => {
const { min, max, exclude } = rule;
return (min === void 0 || pageId >= min) && (max === void 0 || pageId <= max) && !isPageExcluded(pageId, exclude);
};
/**
* Builds an exclude set from a PageRange array for O(1) lookups.
*
* @param excludeList - List of page IDs or [from, to] ranges
* @returns Set of all excluded page IDs
*
* @remarks
* This expands ranges into explicit page IDs for fast membership checks. For typical
* book-scale inputs (thousands of pages), this is small and keeps downstream logic
* simple and fast. If you expect extremely large ranges (e.g., millions of pages),
* consider avoiding broad excludes or introducing a range-based membership structure.
*
* @example
* buildExcludeSet([1, 5, [10, 12]])
* // → Set { 1, 5, 10, 11, 12 }
*/
const buildExcludeSet = (excludeList) => {
const excludeSet = /* @__PURE__ */ new Set();
for (const item of excludeList || []) if (typeof item === "number") excludeSet.add(item);
else for (let i = item[0]; i <= item[1]; i++) excludeSet.add(i);
return excludeSet;
};
/**
* Creates a segment with optional to and meta fields.
* Returns null if content is empty after trimming.
*
* @param content - Segment content
* @param fromPageId - Starting page ID
* @param toPageId - Optional ending page ID (omitted if same as from)
* @param meta - Optional metadata to attach
* @returns Segment object or null if empty
*
* @example
* createSegment('Hello world', 1, 3, { chapter: 1 })
* // → { content: 'Hello world', from: 1, to: 3, meta: { chapter: 1 } }
*
* createSegment(' ', 1, undefined, undefined)
* // → null (empty content)
*/
const createSegment = (content, fromPageId, toPageId, meta) => {
const trimmed = content.trim();
if (!trimmed) return null;
return {
content: trimmed,
from: fromPageId,
...toPageId !== void 0 && toPageId !== fromPageId && { to: toPageId },
...meta && { meta }
};
};
/**
* Expands breakpoint patterns and pre-computes exclude sets.
*
* @param breakpoints - Array of breakpoint patterns or rules
* @param processPattern - Function to expand tokens in patterns (with bracket escaping)
* @param processRawPattern - Function to expand tokens without bracket escaping (for regex field)
* @returns Array of expanded breakpoints with compiled regexes
*
* @remarks
* This function compiles regex patterns dynamically. This can be a ReDoS vector
* if patterns come from untrusted sources. In typical usage, breakpoint rules
* are application configuration, not user input.
*/
/**
* Builds regex source from words array.
* Words are escaped, processed, sorted by length, and joined with alternation.
*/
const buildWordsRegex = (words, processPattern) => {
const processed = words.map((w, i) => ({
originalIndex: i,
w: w.trimStart()
})).filter(({ w }) => w.length > 0).map(({ w, originalIndex }) => ({
originalIndex,
pattern: processPattern(escapeWordsOutsideTokens(w))
}));
if (processed.length === 0) return null;
const seen = /* @__PURE__ */ new Set();
const unique = [];
for (const item of processed) if (!seen.has(item.pattern)) {
seen.add(item.pattern);
unique.push(item);
}
unique.sort((a, b) => b.pattern.length - a.pattern.length);
return `\\s+(?:${unique.map((item) => `(?<_w${item.originalIndex}>${item.pattern})`).join("|")})`;
};
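// --- Editor's sketch (identity processPattern assumed): words compile into a
// whitespace-anchored alternation with per-word named groups for debug attribution,
// longest pattern first so longer words win ties.
buildWordsRegex(["ثم", "فهذا"], (p) => p);
// → "\s+(?:(?<_w1>فهذا)|(?<_w0>ثم))"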
/** Compiles skipWhen pattern to regex, or null if not present. */
const compileSkipWhenRegex = (rule, processPattern) => {
if (rule.skipWhen === void 0) return null;
const expandedSkip = processPattern(rule.skipWhen);
try {
return new RegExp(expandedSkip, "mu");
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
throw new Error(`Invalid breakpoint skipWhen regex: ${rule.skipWhen}\n Cause: ${message}`);
}
};
/** Compiles a regex from a pattern string, throws descriptive error on failure. */
const compilePatternRegex = (pattern, fieldName) => {
try {
return new RegExp(pattern, "gmu");
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
throw new Error(`Invalid breakpoint ${fieldName}: ${pattern}\n Cause: ${message}`);
}
};
/** Expands a single breakpoint to its expanded form. */
const expandSingleBreakpoint = (bp, processPattern, processRawPattern) => {
const rule = normalizeBreakpoint(bp);
const excludeSet = buildExcludeSet(rule.exclude);
const skipWhenRegex = compileSkipWhenRegex(rule, processPattern);
if (rule.words !== void 0) {
const wordsPattern = buildWordsRegex(rule.words, processPattern);
if (wordsPattern === null) return null;
return {
excludeSet,
regex: compilePatternRegex(wordsPattern, `words: ${rule.words.join(", ")}`),
rule,
skipWhenRegex,
splitAt: rule.split === "at"
};
}
const rawPattern = rule.regex ?? rule.pattern;
if (rawPattern === "" || rawPattern === void 0) return {
excludeSet,
regex: null,
rule,
skipWhenRegex,
splitAt: false
};
return {
excludeSet,
regex: compilePatternRegex(rule.regex !== void 0 && processRawPattern ? processRawPattern(rawPattern) : processPattern(rawPattern), rule.regex !== void 0 ? "regex" : "pattern"),
rule,
skipWhenRegex,
splitAt: rule.split === "at"
};
};
const expandBreakpoints = (breakpoints, processPattern, processRawPattern) => breakpoints.map((bp) => expandSingleBreakpoint(bp, processPattern, processRawPattern)).filter((bp) => bp !== null);
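// --- Editor's example (identity token expander assumed): a plain string breakpoint
// compiles to a global regex, while an empty pattern stays regex-less and falls back
// to page-boundary breaking.
const bps = expandBreakpoints(["\\n\\n", { pattern: "" }], (p) => p);
// bps[0].regex instanceof RegExp → true; bps[1].regex === null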
/**
* Applies a configured joiner at detected page boundaries within a multi-page content chunk.
*
* This is used for breakpoint-generated segments which don't have access to the original
* `pageMap.pageBreaks` offsets. We detect page starts sequentially by searching for each page's
* prefix after the previous boundary, then replace ONLY the single newline immediately before
* that page start.
*
* This avoids converting real in-page newlines, while still normalizing page joins consistently.
*/
const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalizedPages, joiner) => {
if (joiner === "newline" || fromIdx >= toIdx || !content.includes("\n")) return content;
let updated = content;
let searchFrom = 0;
for (let pi = fromIdx + 1; pi <= toIdx; pi++) {
const pageData = normalizedPages.get(pageIds[pi]);
if (!pageData) continue;
const found = findPrefixPositionInContent(updated, pageData.content.trimStart(), searchFrom);
if (found > 0 && updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
if (found > 0) searchFrom = found;
}
return updated;
};
/**
* Finds the position of a page prefix in content, trying multiple prefix lengths.
*/
const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
for (const len of JOINER_PREFIX_LENGTHS) {
const prefix = trimmedPageContent.slice(0, Math.min(len, trimmedPageContent.length)).trim();
if (!prefix) continue;
const pos = content.indexOf(prefix, searchFrom);
if (pos > 0) return pos;
}
return -1;
};
/**
* Estimates how far into the current page `remainingContent` begins.
*
* During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
* When that happens, raw cumulative page offsets (computed from full page starts) can overestimate
* expected boundary positions. This helper computes an approximate starting offset by matching
* a short prefix of `remainingContent` inside the current page content.
*/
const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
const currentPageData = normalizedPages.get(pageIds[currentFromIdx]);
if (!currentPageData) return 0;
const remPrefix = remainingContent.slice(0, 500).trimStart();
if (!remPrefix) return 0;
const maxNeedleLen = Math.min(30, remPrefix.length);
for (let len = maxNeedleLen; len >= 5; len -= 5) {
const needle = remPrefix.slice(0, len);
const idx = currentPageData.content.indexOf(needle);
if (idx >= 0) return idx;
}
if (remPrefix.length >= 3) {
const needle = remPrefix.slice(0, 3);
const idx = currentPageData.content.indexOf(needle);
if (idx >= 0) return idx;
}
return 0;
};
const estimateStartOffsetInCurrentPageFromEnd = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
const currentPageData = normalizedPages.get(pageIds[currentFromIdx]);
if (!currentPageData) return 0;
const remPrefix = remainingContent.slice(0, 500).trimStart();
if (!remPrefix) return 0;
const maxNeedleLen = Math.min(30, remPrefix.length);
for (let len = maxNeedleLen; len >= 5; len -= 5) {
const needle = remPrefix.slice(0, len);
const idx = currentPageData.content.lastIndexOf(needle);
if (idx >= 0) return idx;
}
if (remPrefix.length >= 3) {
const needle = remPrefix.slice(0, 3);
const idx = currentPageData.content.lastIndexOf(needle);
if (idx >= 0) return idx;
}
return 0;
};
const selectStartOffsetInCurrentPage = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
const first = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
const last = estimateStartOffsetInCurrentPageFromEnd(segmentContent, fromIdx, pageIds, normalizedPages);
const candidates = [...new Set([first, last])];
if (candidates.length <= 1 || fromIdx + 1 > toIdx) return candidates[0] ?? 0;
const rawBoundary = cumulativeOffsets[fromIdx + 1] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[fromIdx + 1] - cumulativeOffsets[fromIdx]) : void 0;
if (rawBoundary === void 0) return candidates[0] ?? 0;
let best = candidates[0] ?? 0;
let bestScore = Number.POSITIVE_INFINITY;
for (const candidate of candidates) {
const expectedBoundary = Math.max(0, rawBoundary - candidate);
const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx + 1, expectedBoundary, pageIds, normalizedPages, logger);
if (pos > 0) {
const score = Math.abs(pos - expectedBoundary);
if (score < bestScore) {
bestScore = score;
best = candidate;
}
}
}
return best;
};
/**
* Attempts to find the start position of a target page within remainingContent,
* anchored near an expected boundary position to reduce collisions.
*
* This is used to define breakpoint windows in terms of actual content being split, rather than
* raw per-page offsets which can desync when structural rules strip markers.
*/
const findPageStartNearExpectedBoundary = (remainingContent, targetPageIdx, expectedBoundary, pageIds, normalizedPages, logger) => {
const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
if (!targetPageData) return -1;
const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
const searchStart = Math.max(0, approx - 1e4);
const searchEnd = Math.min(remainingContent.length, approx + 2e3);
const targetTrimmed = targetPageData.content.trimStart();
const ignoreDeviation = expectedBoundary >= remainingContent.length;
const scanStart = ignoreDeviation ? 0 : searchStart;
const scanEnd = ignoreDeviation ? remainingContent.length : searchEnd;
const expectedForRanking = ignoreDeviation ? 0 : expectedBoundary;
for (const len of WINDOW_PREFIX_LENGTHS) {
const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
if (!prefix) continue;
const candidates = findAnchorCandidates(remainingContent, prefix, scanStart, scanEnd);
if (candidates.length === 0) continue;
const deviationLimit = ignoreDeviation ? Number.POSITIVE_INFINITY : MAX_DEVIATION;
const inRange = candidates.filter((c) => Math.abs(c.pos - expectedBoundary) <= deviationLimit);
if (inRange.length > 0) return selectBestAnchor(inRange, expectedForRanking).pos;
const bestOverall = selectBestAnchor(candidates, expectedForRanking);
logger?.debug?.("[breakpoints] findPageStartNearExpectedBoundary: Rejected match exceeding deviation", {
bestDistance: Math.abs(bestOverall.pos - expectedForRanking),
expectedBoundary,
matchPos: bestOverall.pos,
maxDeviation: deviationLimit,
prefixLength: len,
targetPageIdx
});
}
return -1;
};
/** Finds all whitespace-preceded occurrences of a prefix within a search range */
const findAnchorCandidates = (content, prefix, start, end) => {
const candidates = [];
let pos = content.indexOf(prefix, start);
while (pos !== -1 && pos <= end) {
if (pos > 0) {
const charBefore = content[pos - 1];
if (charBefore === "\n") candidates.push({
isNewline: true,
pos
});
else if (/\s/.test(charBefore)) candidates.push({
isNewline: false,
pos
});
}
pos = content.indexOf(prefix, pos + 1);
}
return candidates;
};
/** Selects the best anchor candidate, prioritizing newlines then proximity to boundary */
const selectBestAnchor = (candidates, expectedBoundary) => {
return candidates.reduce((best, curr) => {
const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 : 20);
return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 : 20) < bestScore ? curr : best;
});
};
/**
* Finds the start position of a target page after a minimum position.
* Used to avoid duplicate earlier matches when content repeats.
*/
const findPageStartAfterPosition = (remainingContent, targetPageIdx, minPos, pageIds, normalizedPages) => {
const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
if (!targetPageData) return -1;
const targetTrimmed = targetPageData.content.trimStart();
for (const len of WINDOW_PREFIX_LENGTHS) {
const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
if (!prefix) continue;
const after = findAnchorCandidates(remainingContent, prefix, Math.max(0, minPos), remainingContent.length).filter((c) => c.pos > minPos);
if (after.length > 0) return selectBestAnchor(after, minPos).pos;
}
return -1;
};
const buildBoundaryPositionsFastPath = (segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger) => {
const boundaryPositions = [0];
logger?.debug?.("[breakpoints] Using fast-path for large segment in buildBoundaryPositions", {
fromIdx,
pageCount,
toIdx
});
const baseOffset = cumulativeOffsets[fromIdx] ?? 0;
for (let i = fromIdx + 1; i <= toIdx; i++) {
const offset = cumulativeOffsets[i];
if (offset !== void 0) {
const boundary = Math.max(0, offset - baseOffset);
const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
boundaryPositions.push(Math.max(prevBoundary + 1, Math.min(boundary, segmentContent.length)));
}
}
boundaryPositions.push(segmentContent.length);
return boundaryPositions;
};
const isBoundaryPositionValid = (pos, prevBoundary, expectedBoundary, segmentLength, ignoreDeviation = false) => {
if (pos <= 0 || pos <= prevBoundary) return false;
if (ignoreDeviation) return true;
if (expectedBoundary >= segmentLength) return true;
const deviationLimit = MAX_DEVIATION;
return Math.abs(pos - expectedBoundary) < deviationLimit;
};
const resolveBoundaryMatch = (segmentContent, pageIdx, rawBoundary, startOffsetInFromPage, canInferStartOffset, pageIds, normalizedPages, logger) => {
let expectedBoundary = rawBoundary !== void 0 ? Math.max(0, rawBoundary - startOffsetInFromPage) : segmentContent.length;
let pos = findPageStartNearExpectedBoundary(segmentContent, pageIdx, expectedBoundary, pageIds, normalizedPages, logger);
let didInferStartOffset = false;
if (pos < 0 && canInferStartOffset && rawBoundary !== void 0) {
const relaxedPos = findPageStartNearExpectedBoundary(segmentContent, pageIdx, segmentContent.length, pageIds, normalizedPages, logger);
if (relaxedPos > 0) {
const inferredStartOffset = rawBoundary - relaxedPos;
const currentExpected = Math.max(0, rawBoundary - startOffsetInFromPage);
if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) < 500) {
startOffsetInFromPage = inferredStartOffset;
expectedBoundary = Math.max(0, rawBoundary - startOffsetInFromPage);
pos = relaxedPos;
didInferStartOffset = true;
}
}
}
return {
didInferStartOffset,
expectedBoundary,
pos,
startOffsetInFromPage
};
};
const buildBoundaryPositionsAccurate = (segmentContent, fromIdx, toIdx, pageCount, pageIds, normalizedPages, cumulativeOffsets, logger) => {
const boundaryPositions = [0];
logger?.debug?.("[breakpoints] buildBoundaryPositions: Using accurate string-search path", {
contentLength: segmentContent.length,
fromIdx,
pageCount,
toIdx
});
let startOffsetInFromPage = selectStartOffsetInCurrentPage(segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
let didInferStartOffset = false;
for (let i = fromIdx + 1; i <= toIdx; i++) {
const rawBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx]) : void 0;
const resolved = resolveBoundaryMatch(segmentContent, i, rawBoundary, startOffsetInFromPage, !didInferStartOffset && i === fromIdx + 1, pageIds, normalizedPages, logger);
startOffsetInFromPage = resolved.startOffsetInFromPage;
didInferStartOffset = didInferStartOffset || resolved.didInferStartOffset;
const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
let resolvedPos = resolved.pos;
if (resolvedPos <= prevBoundary) {
const afterPos = findPageStartAfterPosition(segmentContent, i, prevBoundary + 1, pageIds, normalizedPages);
if (afterPos > prevBoundary) resolvedPos = afterPos;
}
if (isBoundaryPositionValid(resolvedPos, prevBoundary, resolved.expectedBoundary, segmentContent.length)) boundaryPositions.push(resolvedPos);
else {
const estimate = Math.max(prevBoundary + 1, resolved.expectedBoundary);
boundaryPositions.push(Math.min(estimate, segmentContent.length));
}
}
boundaryPositions.push(segmentContent.length);
logger?.debug?.("[breakpoints] buildBoundaryPositions: Complete", { boundaryCount: boundaryPositions.length });
return boundaryPositions;
};
/**
* Builds a boundary position map for pages within the given range.
*
* This function computes page boundaries once per segment and enables
* O(log n) page lookups via binary search with `findPageIndexForPosition`.
*
* Boundaries are derived from segmentContent (post-structural-rules).
* When the segment starts mid-page, an offset correction is applied to
* keep boundary estimates aligned with the segment's actual content space.
*
* @param segmentContent - Full segment content (already processed by structural rules)
* @param fromIdx - Starting page index
* @param toIdx - Ending page index
* @param pageIds - Array of all page IDs
* @param normalizedPages - Map of page ID to normalized content
* @param cumulativeOffsets - Cumulative character offsets (for estimates)
* @param logger - Optional logger for debugging
* @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
* with a sentinel boundary at segmentContent.length as the last element
*
* @example
* // For a 3-page segment:
* buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
* // → [0, 23, 45, 67] where 67 is content.length (sentinel)
*/
const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
const pageCount = toIdx - fromIdx + 1;
const expectedLength = (cumulativeOffsets[toIdx + 1] ?? 0) - (cumulativeOffsets[fromIdx] ?? 0);
if (pageCount >= 1e3 && segmentContent.length === expectedLength) return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
return buildBoundaryPositionsAccurate(segmentContent, fromIdx, toIdx, pageCount, pageIds, normalizedPages, cumulativeOffsets, logger);
};
/**
* Binary search to find which page a position falls within.
* Uses "largest i where boundaryPositions[i] <= position" semantics.
*
* @param position - Character position in segmentContent
* @param boundaryPositions - Precomputed boundary positions (from buildBoundaryPositions)
* @param fromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[fromIdx])
* @returns Page index in pageIds array
*
* @example
* // With boundaries [0, 20, 40, 60] and fromIdx=0:
* findPageIndexForPosition(15, boundaries, 0) // → 0 (first page)
* findPageIndexForPosition(25, boundaries, 0) // → 1 (second page)
* findPageIndexForPosition(40, boundaries, 0) // → 2 (exactly on boundary = that page)
*/
const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
if (boundaryPositions.length <= 1) return fromIdx;
let left = 0;
let right = boundaryPositions.length - 2;
while (left < right) {
const mid = Math.ceil((left + right) / 2);
if (boundaryPositions[mid] <= position) left = mid;
else right = mid - 1;
}
return fromIdx + left;
};
/**
* Finds the end position of a breakpoint window inside `remainingContent`.
*
* The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
* found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
* that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
*/
const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
if (windowEndIdx >= toIdx) return remainingContent.length;
const desiredNextIdx = windowEndIdx + 1;
const minNextIdx = currentFromIdx + 1;
const maxNextIdx = Math.min(desiredNextIdx, toIdx);
const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
let bestExpectedBoundary = remainingContent.length;
for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
if (nextIdx === maxNextIdx) bestExpectedBoundary = expectedBoundary;
const pos = findPageStartNearExpectedBoundary(remainingContent, nextIdx, expectedBoundary, pageIds, normalizedPages, logger);
if (pos > 0) return pos;
}
return Math.min(bestExpectedBoundary, remainingContent.length);
};
/**
* Finds exclusion-based break position using raw cumulative offsets.
*
* This is used to ensure pages excluded by breakpoints are never merged into the same output segment.
* Returns a break position relative to the start of `remainingContent` (i.e. the currentFromIdx start).
*/
const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets) => {
const startingPageId = pageIds[currentFromIdx];
if (expandedBreakpoints.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx];
for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
const pageId = pageIds[pageIdx];
if (expandedBreakpoints.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets[pageIdx] - cumulativeOffsets[currentFromIdx];
}
return -1;
};
/**
* Checks if any page in a range is excluded by the given exclude set.
*
* @param excludeSet - Set of excluded page IDs
* @param pageIds - Array of page IDs
* @param fromIdx - Start index (inclusive)
* @param toIdx - End index (inclusive)
* @returns True if any page in range is excluded
*/
const hasExcludedPageInRange = (excludeSet, pageIds, fromIdx, toIdx) => {
if (excludeSet.size === 0) return false;
for (let pageIdx = fromIdx; pageIdx <= toIdx; pageIdx++) if (excludeSet.has(pageIds[pageIdx])) return true;
return false;
};
/**
* Finds the position of the next page content within remaining content.
* Returns -1 if not found.
*
* @param remainingContent - Content to search in
* @param nextPageData - Normalized data for the next page
* @returns Position of next page content, or -1 if not found
*/
const findNextPagePosition = (remainingContent, nextPageData) => {
const searchPrefix = nextPageData.content.trim().slice(0, Math.min(30, nextPageData.length));
if (searchPrefix.length === 0) return -1;
const pos = remainingContent.indexOf(searchPrefix);
return pos > 0 ? pos : -1;
};
/**
* Finds matches within a window and returns the selected position based on preference and split mode.
*
* @param windowContent - Content to search
* @param regex - Regex to match
* @param prefer - 'longer' for last match, 'shorter' for first match
* @param splitAt - If true, return position BEFORE match (at index). If false, return position AFTER match (at index + length).
* @returns Break position, or -1 if no valid matches
*
* @remarks
* - Matches with length 0 are skipped (prevents infinite loops with lookahead patterns)
* - Matches that would result in position 0 are skipped (prevents empty first segments)
* - For prefer:'shorter', returns immediately on first valid match (optimization)
*/
const findPatternBreakPosition = (windowContent, regex, prefer, splitAt = false) => {
let last;
for (const m of windowContent.matchAll(regex)) {
const idx = m.index ?? -1;
const len = m[0]?.length ?? 0;
if (idx < 0 || len === 0) continue;
const pos = splitAt ? idx : idx + len;
if (pos === 0) continue;
last = {
groups: m.groups,
index: idx,
length: len
};
if (prefer === "shorter") return {
groups: m.groups,
pos
};
}
if (!last) return { pos: -1 };
const finalPos = splitAt ? last.index : last.index + last.length;
return {
groups: last.groups,
pos: finalPos
};
};
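// --- Editor's sketch (not from the source): split:'after' yields the position just
// past the match, split:'at' the match's own index.
findPatternBreakPosition("aaa\n\nbbb", /\n\n/g, "shorter"); // → { pos: 5, groups: undefined }
findPatternBreakPosition("aaa\n\nbbb", /\n\n/g, "shorter", true); // → { pos: 3, groups: undefined }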
|
|
2394
|
-
/**
|
|
2395
|
-
* Handles page boundary breakpoint (empty pattern).
|
|
2396
|
-
* Returns break position or -1 if no valid position found.
|
|
2397
|
-
*/
|
|
2398
|
-
const findStartOfNextPageInWindow = (remainingContent, currentFromIdx, toIdx, pageIds, normalizedPages, targetPos) => {
|
|
2399
|
-
const targetNextPageIdx = currentFromIdx + 1;
|
|
2400
|
-
for (let nextIdx = targetNextPageIdx; nextIdx > currentFromIdx; nextIdx--) if (nextIdx <= toIdx) {
|
|
2401
|
-
const nextPageData = normalizedPages.get(pageIds[nextIdx]);
|
|
2402
|
-
if (nextPageData) {
|
|
2403
|
-
const boundaryPos = findNextPagePosition(remainingContent, nextPageData);
|
|
2404
|
-
if (boundaryPos > 0 && boundaryPos <= targetPos) return boundaryPos;
|
|
2405
|
-
}
|
|
2406
|
-
}
|
|
2407
|
-
return -1;
|
|
2408
|
-
};
|
|
2409
|
-
const handlePageBoundaryBreak = (remainingContent, currentFromIdx, windowEndPosition, maxContentLength, toIdx, pageIds, normalizedPages) => {
|
|
2410
|
-
const targetPos = Math.min(windowEndPosition, remainingContent.length);
|
|
2411
|
-
const isLengthBounded = maxContentLength !== void 0 && windowEndPosition === maxContentLength;
|
|
2412
|
-
if (!isLengthBounded) {
|
|
2413
|
-
const boundaryPos = findStartOfNextPageInWindow(remainingContent, currentFromIdx, toIdx, pageIds, normalizedPages, targetPos);
|
|
2414
|
-
if (boundaryPos > 0) return { pos: boundaryPos };
|
|
2415
|
-
}
|
|
2416
|
-
if (targetPos < remainingContent.length) {
|
|
2417
|
-
const safePos = findSafeBreakPosition(remainingContent, targetPos);
|
|
2418
|
-
if (safePos !== -1) return {
|
|
2419
|
-
pos: safePos,
|
|
2420
|
-
splitReason: isLengthBounded ? "whitespace" : void 0
|
|
2421
|
-
};
|
|
2422
|
-
return {
|
|
2423
|
-
pos: adjustForUnicodeBoundary(remainingContent, targetPos),
|
|
2424
|
-
splitReason: isLengthBounded ? "unicode_boundary" : void 0
|
|
2425
|
-
};
|
|
2426
|
-
}
|
|
2427
|
-
return { pos: targetPos };
|
|
2428
|
-
};
|
|
2429
|
-
const checkBreakpointMatch = (i, remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx, maxContentLength) => {
|
|
2430
|
-
const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
|
|
2431
|
-
const bpCtx = expandedBreakpoints[i];
|
|
2432
|
-
const { rule, regex, excludeSet, skipWhenRegex } = bpCtx;
|
|
2433
|
-
if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) return null;
|
|
2434
|
-
if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) return null;
|
|
2435
|
-
if (skipWhenRegex?.test(remainingContent)) return null;
|
|
2436
|
-
if (regex === null) {
|
|
2437
|
-
const result = handlePageBoundaryBreak(remainingContent, currentFromIdx, windowEndPosition, maxContentLength, toIdx, pageIds, normalizedPages);
|
|
2438
|
-
return {
|
|
2439
|
-
breakPos: result.pos,
|
|
2440
|
-
breakpointIndex: i,
|
|
2441
|
-
contentLengthSplit: result.splitReason && maxContentLength ? {
|
|
2442
|
-
maxContentLength,
|
|
2443
|
-
reason: result.splitReason
|
|
2444
|
-
} : void 0,
|
|
2445
|
-
rule
|
|
2446
|
-
};
|
|
2447
|
-
}
|
|
2448
|
-
const { pos: breakPos, groups } = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer, bpCtx.splitAt);
|
|
2449
|
-
if (breakPos > 0) return {
|
|
2450
|
-
breakPos,
|
|
2451
|
-
breakpointIndex: i,
|
|
2452
|
-
rule,
|
|
2453
|
-
wordIndex: extractDebugIndex(groups, "_w")
|
|
2454
|
-
};
|
|
2455
|
-
return null;
|
|
2456
|
-
};
|
|
2457
|
-
/**
* Tries to find a break position within the current window using breakpoint patterns.
* Returns the matched breakpoint (including its break position), or null if no suitable break was found.
*
* @param remainingContent - Content remaining to be segmented
* @param currentFromIdx - Current starting page index
* @param toIdx - Ending page index
* @param windowEndIdx - Maximum window end index
* @param ctx - Breakpoint context with page data and patterns
* @returns Breakpoint match with break position, or null if no break found
*/
const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx, maxContentLength) => {
const { expandedBreakpoints } = ctx;
for (let i = 0; i < expandedBreakpoints.length; i++) {
const match = checkBreakpointMatch(i, remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx, maxContentLength);
if (match) return match;
}
return null;
};
/**
* Searches backward from a target position to find a "safe" split point.
* A safe split point is after whitespace or punctuation.
*
* @param content The text content
* @param targetPosition The desired split position (hard limit)
* @param lookbackChars How far back to search for a safe break
* @returns The new split position (index), or -1 if no safe break found
*/
const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) => {
const startSearch = Math.max(0, targetPosition - lookbackChars);
for (let i = targetPosition - 1; i >= startSearch; i--) {
const char = content[i];
if (STOP_CHARACTERS.test(char)) return i + 1;
}
return -1;
};
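/*
* Illustrative sketch (not from the original bundle), assuming STOP_CHARACTERS
* matches whitespace/punctuation:
*
*   findSafeBreakPosition("lorem ipsum dolor", 15);
*   // -> 12: scanning back from index 14, the first stop character is the
*   //    space at index 11, so the split lands just before "dolor"
*
*   findSafeBreakPosition("abcdef", 5);
*   // -> -1: no whitespace or punctuation within the lookback window
*/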
//#endregion
//#region src/segmentation/debug-meta.ts
const resolveDebugConfig = (debug) => {
if (debug === true) return {
includeBreakpoint: true,
includeRule: true,
metaKey: "_flappa"
};
if (!debug || typeof debug !== "object") return null;
const { metaKey, include } = debug;
const includeRule = Array.isArray(include) ? include.includes("rule") : true;
return {
includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
includeRule,
metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
};
};
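/*
* Illustrative sketch (not from the original bundle) of the accepted shapes:
*
*   resolveDebugConfig(true);
*   // -> { includeBreakpoint: true, includeRule: true, metaKey: "_flappa" }
*
*   resolveDebugConfig({ include: ["rule"], metaKey: "_dbg" });
*   // -> { includeBreakpoint: false, includeRule: true, metaKey: "_dbg" }
*
*   resolveDebugConfig(false);
*   // -> null
*/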
const getRulePatternType = (rule) => {
return PATTERN_TYPE_KEYS.find((key) => key in rule) ?? "regex";
};
const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
const mergeDebugIntoMeta = (meta, metaKey, patch) => {
const out = meta ? { ...meta } : {};
const existing = out[metaKey];
out[metaKey] = {
...isPlainObject(existing) ? existing : {},
...patch
};
return out;
};
const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
const patternType = getRulePatternType(rule);
const patterns = rule[patternType];
const word = wordIndex !== void 0 && Array.isArray(patterns) && patterns[wordIndex] !== void 0 ? patterns[wordIndex] : void 0;
return { rule: {
index: ruleIndex,
patternType,
...wordIndex !== void 0 ? { wordIndex } : {},
...word !== void 0 ? { word } : {}
} };
};
const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
index: breakpointIndex,
kind: rule.pattern === "" ? "pageBoundary" : rule.regex ? "regex" : "pattern",
pattern: rule.pattern ?? rule.regex,
...wordIndex !== void 0 ? { wordIndex } : {},
...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
} });
/**
* Formats the rule portion of the debug info into a human-readable string.
* @param rule - The rule debug entry from the segment metadata
* @param concise - Whether to return the short single-line form
*/
const formatRuleReason = (rule, concise) => {
const { index, patternType, wordIndex, word } = rule;
if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
const wordInfo = word ? ` (Matched: "${word}")` : "";
return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
};
const formatBreakpointReason = (breakpoint, concise) => {
const { index, kind, pattern, wordIndex, word } = breakpoint;
if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
return `Breakpoint #${index} (${kind}) - "${pattern}"`;
};
const formatContentLengthReason = (split, concise) => {
const { maxContentLength, splitReason } = split;
if (concise) return `> ${maxContentLength} (${splitReason})`;
return `Safety Split (${splitReason}) > ${maxContentLength}`;
};
/**
* Helper to format the debug info into a human-readable string.
* @param meta - The segment metadata object
* @param options - Formatting options
*/
const getDebugReason = (meta, options) => {
const debug = meta?._flappa;
if (!debug) return "-";
const concise = options?.concise;
if (debug.rule) return formatRuleReason(debug.rule, concise);
if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
return "Unknown";
};
/**
* Convenience helper to get the formatted debug reason directly from a segment.
* @param segment - The segment object
* @param options - Formatting options
*/
const getSegmentDebugReason = (segment, options) => {
return getDebugReason(segment.meta, options);
};
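/*
* Illustrative sketch (not from the original bundle); the breakpoint debug
* entry shown here is hypothetical:
*
*   const segment = { content: "...", from: 3, meta: { _flappa: { breakpoint: {
*   index: 0, kind: "pattern", pattern: "{{kitab}}", wordIndex: 1, word: "كتاب"
*   } } } };
*   getSegmentDebugReason(segment); // -> 'Breakpoint #0 (Words) [idx:1] - "كتاب"'
*   getSegmentDebugReason(segment, { concise: true }); // -> 'Breakpoint: "كتاب"'
*/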
//#endregion
//#region src/segmentation/pattern-validator.ts
const KNOWN_TOKENS = new Set(getAvailableTokens());
const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
const buildBareTokenRegex = () => {
const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
};
/**
* Validates a single pattern for common issues.
*/
const validatePattern = (pattern, seenPatterns) => {
if (!pattern.trim()) return {
message: "Empty pattern is not allowed",
type: "empty_pattern"
};
if (seenPatterns.has(pattern)) return {
message: `Duplicate pattern: "${pattern}"`,
pattern,
type: "duplicate"
};
seenPatterns.add(pattern);
TOKEN_INSIDE_BRACES.lastIndex = 0;
for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
const name = match[1];
if (!KNOWN_TOKENS.has(name)) return {
message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
suggestion: "Check spelling or use a known token",
token: name,
type: "unknown_token"
};
}
for (const match of pattern.matchAll(buildBareTokenRegex())) {
const [full, name] = match;
const idx = match.index;
if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
suggestion: `{{${full}}}`,
token: name,
type: "missing_braces"
};
}
};
/**
* Validates an array of patterns, returning parallel array of issues.
*/
const validatePatternArray = (patterns) => {
const seen = /* @__PURE__ */ new Set();
const issues = patterns.map((p) => validatePattern(p, seen));
return issues.some(Boolean) ? issues : void 0;
};
const applyRulePatternValidation = (result, key, patterns) => {
if (!patterns) return false;
const issues = validatePatternArray(patterns);
if (!issues) return false;
result[key] = issues;
return true;
};
const validateTemplateRule = (rule, result) => {
if (rule.template === void 0) return false;
const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
if (!issue) return false;
result.template = issue;
return true;
};
const validateRegexRule = (rule, result) => {
if (rule.regex === void 0) return false;
if (!rule.regex.trim()) {
result.regex = {
message: "Empty pattern is not allowed",
type: "empty_pattern"
};
return true;
}
try {
new RegExp(rule.regex, "u");
return false;
} catch (error) {
result.regex = {
message: error instanceof Error ? error.message : String(error),
pattern: rule.regex,
type: "invalid_regex"
};
return true;
}
};
const invalidDictionaryEntryIssue = (message) => ({
message,
type: "invalid_option"
});
const validateDictionaryEntryRule = (rule, result) => {
if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
const issues = {};
const { allowCommaSeparated, allowParenthesized, allowWhitespaceBeforeColon, captureName, maxLetters, midLineSubentries, minLetters, stopWords } = rule.dictionaryEntry;
if (!Array.isArray(stopWords) || stopWords.some((word) => typeof word !== "string" || !word.trim())) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
if (allowCommaSeparated !== void 0 && typeof allowCommaSeparated !== "boolean") issues.allowCommaSeparated = invalidDictionaryEntryIssue("allowCommaSeparated must be a boolean");
if (allowParenthesized !== void 0 && typeof allowParenthesized !== "boolean") issues.allowParenthesized = invalidDictionaryEntryIssue("allowParenthesized must be a boolean");
if (allowWhitespaceBeforeColon !== void 0 && typeof allowWhitespaceBeforeColon !== "boolean") issues.allowWhitespaceBeforeColon = invalidDictionaryEntryIssue("allowWhitespaceBeforeColon must be a boolean");
if (midLineSubentries !== void 0 && typeof midLineSubentries !== "boolean") issues.midLineSubentries = invalidDictionaryEntryIssue("midLineSubentries must be a boolean");
if (captureName !== void 0 && !captureName.match(/^[A-Za-z_]\w*$/)) issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
if (minLetters !== void 0 && (!Number.isInteger(minLetters) || minLetters < 1)) issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
if (maxLetters !== void 0 && (!Number.isInteger(maxLetters) || maxLetters < (minLetters ?? 2))) issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${minLetters ?? 2}`);
if (Object.keys(issues).length === 0) return false;
result.dictionaryEntry = issues;
return true;
};
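/*
* Illustrative sketch (not from the original bundle):
*
*   const result = {};
*   validateDictionaryEntryRule({ dictionaryEntry: { minLetters: 0, stopWords: ["قال"] } }, result);
*   // -> true, and result.dictionaryEntry.minLetters is
*   //    { message: "minLetters must be an integer >= 1", type: "invalid_option" }
*/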
const formatValidationIssue = (_type, issue, loc) => {
if (!issue) return null;
if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
if (issue.type === "invalid_regex") return `${loc}: Invalid regex (${issue.message})`;
return `${loc}: ${issue.message || issue.type}`;
};
/**
* Validates split rules for common pattern issues.
*
* Checks for:
* - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
* - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
* - Duplicate patterns within the same rule
*
* @param rules - Array of split rules to validate
* @returns Array parallel to input with validation results (undefined if no issues)
*
* @example
* const issues = validateRules([
* { lineStartsAfter: ['raqms:num'] }, // Missing braces
* { lineStartsWith: ['{{unknown}}'] }, // Unknown token
* ]);
* // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
* // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
*/
const validateRules = (rules) => rules.map((rule) => {
const result = {};
const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", rule.lineStartsWith);
const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", rule.lineStartsAfter);
const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", rule.lineEndsWith);
const templateIssues = validateTemplateRule(rule, result);
const regexIssues = validateRegexRule(rule, result);
const dictionaryEntryIssues = validateDictionaryEntryRule(rule, result);
return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues || dictionaryEntryIssues ? result : void 0;
});
/**
* Formats a validation result array into a list of human-readable error messages.
*
* Useful for displaying validation errors in UIs.
*
* @param results - The result array from `validateRules()`
* @returns Array of formatted error strings
*
* @example
* const issues = validateRules(rules);
* const errors = formatValidationReport(issues);
* // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
*/
const formatValidationReport = (results) => results.flatMap((result, i) => {
if (!result) return [];
return Object.entries(result).flatMap(([type, issues]) => formatValidationIssues(type, issues, i + 1));
});
const formatValidationIssues = (type, issues, ruleNumber) => {
if (type === "dictionaryEntry" && issues && typeof issues === "object" && !Array.isArray(issues)) return Object.entries(issues).map(([field, issue]) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}.${field}`)).filter((msg) => msg !== null);
return (Array.isArray(issues) ? issues : [issues]).map((issue) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}`)).filter((msg) => msg !== null);
};
//#endregion
//#region src/segmentation/breakpoint-processor.ts
const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
const buildNormalizedPagesMap = (pages, normalizedContent) => {
const normalizedPages = /* @__PURE__ */ new Map();
for (let i = 0; i < pages.length; i++) {
const content = normalizedContent[i];
normalizedPages.set(pages[i].id, {
content,
index: i,
length: content.length
});
}
return normalizedPages;
};
const buildCumulativeOffsets = (pageIds, normalizedPages) => {
const cumulativeOffsets = [0];
let totalOffset = 0;
for (let i = 0; i < pageIds.length; i++) {
const pageData = normalizedPages.get(pageIds[i]);
totalOffset += pageData?.length ?? 0;
if (i < pageIds.length - 1) totalOffset += 1;
cumulativeOffsets.push(totalOffset);
}
return cumulativeOffsets;
};
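/*
* Illustrative sketch (not from the original bundle): for three pages whose
* normalized contents have lengths 10, 5 and 8, buildCumulativeOffsets yields
* [0, 11, 17, 25] - each boundary between pages adds 1 for the joiner
* character, and no joiner is counted after the last page.
*/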
const hasAnyExclusionsInRange = (expandedBreakpoints, pageIds, fromIdx, toIdx) => expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
let windowEndIdx = currentFromIdx;
for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
else break;
return windowEndIdx;
};
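/*
* Illustrative sketch (not from the original bundle): the window is bounded by
* page IDs rather than array indices, so gaps in the ID sequence count toward
* the span. With pageIds = [5, 6, 9, 10], currentFromIdx = 0 and maxPages = 3,
* the max window page ID is 8, so the loop stops before ID 9:
*
*   computeWindowEndIdx(0, 3, [5, 6, 9, 10], 3); // -> 1
*/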
const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
/**
* Computes the actual start and end page indices for a piece using
* precomputed boundary positions and binary search.
*
* @param pieceStartPos - Start position of the piece in the full segment content
* @param pieceEndPos - End position (exclusive) of the piece
* @param boundaryPositions - Precomputed boundary positions from buildBoundaryPositions
* @param baseFromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[baseFromIdx])
* @param toIdx - Maximum page index
* @returns Object with actualStartIdx and actualEndIdx
*/
const computePiecePages = (pieceStartPos, pieceEndPos, boundaryPositions, baseFromIdx, toIdx) => {
const actualStartIdx = findPageIndexForPosition(pieceStartPos, boundaryPositions, baseFromIdx);
const endPos = Math.max(pieceStartPos, pieceEndPos - 1);
return {
actualEndIdx: Math.min(findPageIndexForPosition(endPos, boundaryPositions, baseFromIdx), toIdx),
actualStartIdx
};
};
const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages) => {
let nextFromIdx = actualEndIdx;
if (remainingContent && actualEndIdx + 1 <= toIdx) {
const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
if (nextPageData) {
const nextPrefix = nextPageData.content.slice(0, 30);
const remainingPrefix = remainingContent.trimStart().slice(0, 30);
if (nextPrefix && (remainingContent.startsWith(nextPrefix) || nextPageData.content.startsWith(remainingPrefix))) nextFromIdx = actualEndIdx + 1;
}
}
return nextFromIdx;
};
const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, includeMeta ? meta : void 0);
/**
* Finds the break offset within a window, trying exclusions first, then patterns.
*
* @returns Break offset relative to remainingContent, or windowEndPosition as fallback
*/
const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer, maxContentLength) => {
if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
}
const patternMatch = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
expandedBreakpoints,
normalizedPages,
pageIds,
prefer
}, maxContentLength);
if (patternMatch && patternMatch.breakPos > 0) return {
breakOffset: patternMatch.breakPos,
breakpointIndex: patternMatch.breakpointIndex,
breakpointRule: patternMatch.rule,
contentLengthSplit: patternMatch.contentLengthSplit,
wordIndex: patternMatch.wordIndex
};
if (windowEndPosition < remainingContent.length) {
const safeOffset = findSafeBreakPosition(remainingContent, windowEndPosition);
if (safeOffset !== -1) return {
breakOffset: safeOffset,
contentLengthSplit: maxContentLength ? {
maxContentLength,
reason: "whitespace"
} : void 0
};
return {
breakOffset: adjustForUnicodeBoundary(remainingContent, windowEndPosition),
contentLengthSplit: maxContentLength ? {
maxContentLength,
reason: "unicode_boundary"
} : void 0
};
}
return { breakOffset: windowEndPosition };
};
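/*
* Note (not from the original bundle): the fallback ladder above is, in order,
* (1) exclusion boundaries, (2) breakpoint pattern matches, (3) the nearest
* safe whitespace/punctuation break before windowEndPosition, (4) a
* Unicode-safe hard cut, and finally (5) windowEndPosition itself.
*/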
/**
* Advances cursor position past any leading whitespace.
*/
const skipWhitespace = (content, startPos) => {
let pos = startPos;
while (pos < content.length && /\s/.test(content[pos])) pos++;
return pos;
};
/**
* Validates that cumulative offsets match actual content length within a tolerance.
* Required to detect if structural rules (like `lineStartsAfter`) have stripped content
* which would make offset-based calculations inaccurate.
*/
const checkFastPathAlignment = (cumulativeOffsets, fullContent, fromIdx, toIdx, pageCount, logger) => {
const expectedLength = (cumulativeOffsets[toIdx + 1] ?? fullContent.length) - (cumulativeOffsets[fromIdx] ?? 0);
const driftTolerance = Math.max(100, fullContent.length * .01);
const isAligned = Math.abs(expectedLength - fullContent.length) <= driftTolerance;
if (!isAligned && pageCount >= 1e3) logger?.warn?.("[breakpoints] Offset drift detected in fast-path candidate, falling back to slow path", {
actualLength: fullContent.length,
drift: Math.abs(expectedLength - fullContent.length),
expectedLength,
pageCount
});
return isAligned;
};
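/*
* Illustrative sketch (not from the original bundle): for a 50,000-character
* segment the drift tolerance is max(100, 50000 * 0.01) = 500, so cumulative
* offsets may disagree with the actual content length by up to 500 characters
* before the fast path is rejected.
*/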
/**
* Handles the special optimized case for maxPages=0 (1 page per segment).
* This is O(n) and safer than offset arithmetic as it uses source pages directly.
*/
const processTrivialFastPath = (fromIdx, toIdx, pageIds, normalizedPages, pageCount, originalMeta, debugMetaKey, logger) => {
logger?.debug?.("[breakpoints] Using trivial per-page fast-path (maxPages=0)", {
fromIdx,
pageCount,
toIdx
});
const result = [];
for (let i = fromIdx; i <= toIdx; i++) {
const pageData = normalizedPages.get(pageIds[i]);
if (pageData?.content.trim()) {
const meta = getSegmentMetaWithDebug(i === fromIdx, debugMetaKey, originalMeta, null);
const seg = createSegment(pageData.content.trim(), pageIds[i], void 0, meta);
if (seg) result.push(seg);
}
}
return result;
};
/**
* Handles fast-path segmentation for maxPages > 0 using cumulative offsets.
* Avoids O(n²) string searching but requires accurate offsets.
*/
const buildFastPathRawContent = (fullContent, baseOffset, cumulativeOffsets, segStart, segEnd, toIdx) => {
const startOffset = Math.max(0, (cumulativeOffsets[segStart] ?? 0) - baseOffset);
const endOffset = segEnd < toIdx ? Math.max(0, (cumulativeOffsets[segEnd + 1] ?? fullContent.length) - baseOffset) : fullContent.length;
return fullContent.slice(startOffset, endOffset).trim();
};
const buildFastPathSegment = (fullContent, baseOffset, cumulativeOffsets, segStart, segEnd, fromIdx, toIdx, pageIds, originalMeta, debugMetaKey) => {
const rawContent = buildFastPathRawContent(fullContent, baseOffset, cumulativeOffsets, segStart, segEnd, toIdx);
if (!rawContent) return null;
const meta = getSegmentMetaWithDebug(segStart === fromIdx, debugMetaKey, originalMeta, null);
const seg = {
content: rawContent,
from: pageIds[segStart]
};
if (segEnd > segStart) seg.to = pageIds[segEnd];
if (meta) seg.meta = meta;
return seg;
};
const processOffsetFastPath = (fullContent, fromIdx, toIdx, pageIds, cumulativeOffsets, maxPages, originalMeta, debugMetaKey, logger) => {
const result = [];
const pageCount = toIdx - fromIdx + 1;
logger?.debug?.("[breakpoints] Using offset-based fast-path for large segment", {
fromIdx,
maxPages,
pageCount,
toIdx
});
const baseOffset = cumulativeOffsets[fromIdx] ?? 0;
let segStart = fromIdx;
const needsPeel = (startIdx) => pageIds[toIdx] - pageIds[startIdx] > maxPages;
for (; segStart <= toIdx && needsPeel(segStart); segStart++) {
const seg = buildFastPathSegment(fullContent, baseOffset, cumulativeOffsets, segStart, segStart, fromIdx, toIdx, pageIds, originalMeta, debugMetaKey);
if (seg) result.push(seg);
}
if (segStart <= toIdx) {
const seg = buildFastPathSegment(fullContent, baseOffset, cumulativeOffsets, segStart, toIdx, fromIdx, toIdx, pageIds, originalMeta, debugMetaKey);
if (seg) result.push(seg);
}
return result;
};
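/*
* Illustrative sketch (not from the original bundle): with pageIds =
* [1, 2, 3, 4, 5, 6] and maxPages = 2, the peel loop emits single-page
* segments for pages 1, 2 and 3 (while the remaining ID span exceeds 2),
* then the tail emits one final segment covering pages 4-6.
*/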
/**
* Checks if the remaining content fits within paged/length limits.
* If so, pushes the final segment and returns true.
*
* @param actualRemainingEndIdx - The actual end page index of the remaining content
* (computed from boundaryPositions), NOT the original segment's toIdx. This is critical
* for maxPages=0 scenarios where remaining content may end before toIdx.
*/
const handleOversizedSegmentFit = (remainingContent, currentFromIdx, actualRemainingEndIdx, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, result) => {
const remainingSpan = computeRemainingSpan(currentFromIdx, actualRemainingEndIdx, pageIds);
const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, actualRemainingEndIdx);
const fitsInPages = remainingSpan <= maxPages;
const fitsInLength = !maxContentLength || remainingContent.length <= maxContentLength;
if (fitsInPages && fitsInLength && !remainingHasExclusions) {
const includeMeta = isFirstPiece || Boolean(debugMetaKey);
const finalSeg = createFinalSegment(remainingContent, currentFromIdx, actualRemainingEndIdx, pageIds, getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint), includeMeta);
if (finalSeg) result.push(finalSeg);
return true;
}
return false;
};
/**
* Builds metadata for a segment piece, optionally including debug info.
*/
const getSegmentMetaWithDebug = (isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, contentLengthSplit) => {
if (!(isFirstPiece || Boolean(debugMetaKey))) return;
let meta = isFirstPiece ? originalMeta : void 0;
if (debugMetaKey) {
if (lastBreakpoint) meta = mergeDebugIntoMeta(meta, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule, lastBreakpoint.wordIndex));
if (contentLengthSplit) meta = mergeDebugIntoMeta(meta, debugMetaKey, { contentLengthSplit: {
maxContentLength: contentLengthSplit.maxContentLength,
splitReason: contentLengthSplit.reason
} });
}
return meta;
};
/**
* Calculates window end position, capped by maxContentLength if present.
*/
const getWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger) => {
const pos = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
return maxContentLength ? Math.min(pos, maxContentLength) : pos;
};
/**
* Advances cursorPos and currentFromIdx for the next iteration.
*/
const advanceCursorAndIndex = (fullContent, breakPos, actualEndIdx, toIdx, pageIds, normalizedPages) => {
const nextCursorPos = skipWhitespace(fullContent, breakPos);
return {
currentFromIdx: computeNextFromIdx(fullContent.slice(nextCursorPos, nextCursorPos + 500), actualEndIdx, toIdx, pageIds, normalizedPages),
cursorPos: nextCursorPos
};
};
const computeIterationWindow = (fullContent, cursorPos, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, maxPages, maxContentLength) => {
const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
const windowEndAbsPos = boundaryPositions[windowEndIdx - fromIdx + 1] ?? fullContent.length;
const sliceEndByPages = Math.min(fullContent.length, windowEndAbsPos + 4e3);
const sliceEndByLength = maxContentLength ? Math.min(fullContent.length, cursorPos + maxContentLength + 4e3) : fullContent.length;
const sliceEnd = Math.max(cursorPos + 1, Math.min(sliceEndByPages, sliceEndByLength));
return {
remainingContent: fullContent.slice(cursorPos, sliceEnd),
sliceEnd,
windowEndIdx
};
};
const computeWindowEndPositionForIteration = (remainingContent, cursorPos, currentFromIdx, fromIdx, windowEndIdx, toIdx, pageIds, boundaryPositions, normalizedPages, cumulativeOffsets, maxPages, maxContentLength, logger) => {
if (maxPages === 0) {
const nextPageStartPos = boundaryPositions[currentFromIdx - fromIdx + 1] ?? Number.POSITIVE_INFINITY;
const remainingInCurrentPage = Math.max(0, nextPageStartPos - cursorPos);
return Math.min(maxContentLength ? Math.min(remainingInCurrentPage, maxContentLength) : remainingInCurrentPage, remainingContent.length);
}
const pos = getWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger);
return Math.min(pos, remainingContent.length);
};
const ensureProgressingBreakOffset = (foundBreakOffset, remainingContent, cursorPos, maxContentLength, logger) => {
if (foundBreakOffset > 0) return foundBreakOffset;
const fallbackPos = maxContentLength ? Math.min(maxContentLength, remainingContent.length) : 1;
const breakOffset = Math.max(1, fallbackPos);
logger?.warn?.("[breakpoints] No progress from findBreakOffsetForWindow; forcing forward movement", {
breakOffset,
cursorPos
});
return breakOffset;
};
const updateLastBreakpointFromFound = (found, lastBreakpoint) => {
if (found.breakpointIndex !== void 0 && found.breakpointRule) return {
breakpointIndex: found.breakpointIndex,
rule: found.breakpointRule,
wordIndex: found.wordIndex
};
return lastBreakpoint;
};
const appendPieceAndAdvance = (fullContent, cursorPos, breakPos, pieceContent, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, normalizedPages, maxPages, isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, result, logger, contentLengthSplit) => {
let { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
if (actualStartIdx < currentFromIdx) {
logger?.warn?.("[breakpoints] Page attribution drift detected; clamping actualStartIdx", {
actualStartIdx,
currentFromIdx
});
actualStartIdx = currentFromIdx;
}
if (maxPages === 0) {
actualEndIdx = Math.min(actualEndIdx, currentFromIdx);
actualStartIdx = Math.min(actualStartIdx, currentFromIdx);
} else if (maxPages > 0) {
const maxAllowedEndIdx = computeWindowEndIdx(actualStartIdx, toIdx, pageIds, maxPages);
actualEndIdx = Math.min(actualEndIdx, maxAllowedEndIdx);
}
const meta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, contentLengthSplit);
const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, true);
if (pieceSeg) result.push(pieceSeg);
const next = advanceCursorAndIndex(fullContent, breakPos, actualEndIdx, toIdx, pageIds, normalizedPages);
let nextFromIdx = next.currentFromIdx;
if (maxPages === 0) nextFromIdx = findPageIndexForPosition(next.cursorPos, boundaryPositions, fromIdx);
return {
currentFromIdx: nextFromIdx,
cursorPos: next.cursorPos
};
};
const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, logger, debugMetaKey, maxContentLength) => {
const fullContent = segment.content;
const pageCount = toIdx - fromIdx + 1;
const isAligned = checkFastPathAlignment(cumulativeOffsets, fullContent, fromIdx, toIdx, pageCount, logger);
const isPageBoundaryOnly = expandedBreakpoints.every((bp) => bp.regex === null && bp.excludeSet.size === 0 && bp.skipWhenRegex === null);
if (pageCount < 1e3 || !isAligned || !isPageBoundaryOnly || maxContentLength || debugMetaKey) return null;
if (maxPages === 0) return processTrivialFastPath(fromIdx, toIdx, pageIds, normalizedPages, pageCount, segment.meta, debugMetaKey, logger);
return processOffsetFastPath(fullContent, fromIdx, toIdx, pageIds, cumulativeOffsets, maxPages, segment.meta, debugMetaKey, logger);
};
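/*
* Note (not from the original bundle): the fast path above only engages for
* very large segments (>= 1000 pages) whose offsets align, whose breakpoints
* are all plain page boundaries (no regex, exclusions, or skipWhen), and when
* neither maxContentLength nor debug metadata is requested; everything else
* falls through to the iterative path below.
*/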
/**
* For maxPages=0 with maxContentLength: if current page's remaining content fits,
* create a segment and advance to next page without applying breakpoints.
*/
const tryHandleCurrentPageFit = (fullContent, cursorPos, currentFromIdx, fromIdx, actualRemainingEndIdx, boundaryPositions, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segmentMeta, lastBreakpoint, result) => {
if (maxPages !== 0 || !maxContentLength || currentFromIdx >= actualRemainingEndIdx) return { handled: false };
const currentPageEndPos = boundaryPositions[currentFromIdx - fromIdx + 1] ?? fullContent.length;
const currentPageRemainingContent = fullContent.slice(cursorPos, currentPageEndPos).trim();
if (!currentPageRemainingContent) return { handled: false };
const currentPageFitsInLength = currentPageRemainingContent.length <= maxContentLength;
const currentPageHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, currentFromIdx);
if (!currentPageFitsInLength || currentPageHasExclusions) return { handled: false };
const pageBoundaryIdx = expandedBreakpoints.findIndex((bp) => bp.regex === null);
const pageBoundaryBreakpoint = pageBoundaryIdx >= 0 ? {
breakpointIndex: pageBoundaryIdx,
rule: { pattern: "" }
} : lastBreakpoint;
const includeMeta = isFirstPiece || Boolean(debugMetaKey);
const meta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, segmentMeta, pageBoundaryBreakpoint);
const seg = createSegment(currentPageRemainingContent, pageIds[currentFromIdx], void 0, includeMeta ? meta : void 0);
if (seg) result.push(seg);
let newCursorPos = currentPageEndPos;
while (newCursorPos < fullContent.length && /\s/.test(fullContent[newCursorPos])) newCursorPos++;
return {
handled: true,
newCursorPos,
newFromIdx: currentFromIdx + 1,
newLastBreakpoint: pageBoundaryBreakpoint
};
};
const processOversizedSegmentIterative = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength) => {
const result = [];
const fullContent = segment.content;
const pageCount = toIdx - fromIdx + 1;
logger?.debug?.("[breakpoints] processOversizedSegment: Using iterative path", {
contentLength: fullContent.length,
fromIdx,
maxContentLength,
maxPages,
pageCount,
toIdx
});
let cursorPos = 0;
let currentFromIdx = fromIdx;
let isFirstPiece = true;
let lastBreakpoint = null;
const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
logger?.debug?.("[breakpoints] boundaryPositions built", {
boundaryPositions,
fromIdx,
fullContentLength: fullContent.length,
toIdx
});
const MAX_SAFE_ITERATIONS = 1e5;
let didHitMaxIterations = true;
for (let i = 1; i <= MAX_SAFE_ITERATIONS; i++) {
if (cursorPos >= fullContent.length || currentFromIdx > toIdx) {
didHitMaxIterations = false;
break;
}
const { remainingContent, windowEndIdx } = computeIterationWindow(fullContent, cursorPos, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, maxPages, maxContentLength);
if (!remainingContent.trim()) {
didHitMaxIterations = false;
break;
}
const actualRemainingContent = fullContent.slice(cursorPos);
const actualEndPos = Math.max(cursorPos, fullContent.length - 1);
const actualRemainingEndIdx = Math.min(findPageIndexForPosition(actualEndPos, boundaryPositions, fromIdx), toIdx);
const currentPageFit = tryHandleCurrentPageFit(fullContent, cursorPos, currentFromIdx, fromIdx, actualRemainingEndIdx, boundaryPositions, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result);
if (currentPageFit.handled) {
cursorPos = currentPageFit.newCursorPos;
currentFromIdx = currentPageFit.newFromIdx;
lastBreakpoint = currentPageFit.newLastBreakpoint;
isFirstPiece = false;
continue;
}
if (handleOversizedSegmentFit(actualRemainingContent, currentFromIdx, actualRemainingEndIdx, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result)) {
didHitMaxIterations = false;
break;
}
const windowEndPosition = computeWindowEndPositionForIteration(remainingContent, cursorPos, currentFromIdx, fromIdx, windowEndIdx, toIdx, pageIds, boundaryPositions, normalizedPages, cumulativeOffsets, maxPages, maxContentLength, logger);
logger?.trace?.(`[breakpoints] iteration=${i}`, {
currentFromIdx,
cursorPos,
windowEndIdx,
windowEndPosition
});
const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer, maxContentLength);
const breakOffset = ensureProgressingBreakOffset(found.breakOffset, remainingContent, cursorPos, maxContentLength, logger);
lastBreakpoint = updateLastBreakpointFromFound(found, lastBreakpoint);
const breakPos = cursorPos + breakOffset;
const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
if (!pieceContent) {
cursorPos = breakPos;
isFirstPiece = false;
continue;
}
const next = appendPieceAndAdvance(fullContent, cursorPos, breakPos, pieceContent, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, normalizedPages, maxPages, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result, logger, found.contentLengthSplit);
cursorPos = next.cursorPos;
currentFromIdx = next.currentFromIdx;
isFirstPiece = false;
}
if (didHitMaxIterations) logger?.error?.("[breakpoints] Stopped processing oversized segment: reached MAX_SAFE_ITERATIONS", {
cursorPos,
fullContentLength: fullContent.length,
iterations: MAX_SAFE_ITERATIONS
});
logger?.debug?.("[breakpoints] processOversizedSegment: Complete", { resultCount: result.length });
return result;
};
/**
* Processes an oversized segment by iterating through the content and
* breaking it into smaller pieces that fit within maxPages constraints.
*
* Uses precomputed boundary positions for O(log n) page attribution lookups.
*
* Note: This is an internal engine used by `segmentPages()` to apply
* breakpoints to oversized segments.
*/
const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength) => {
const fast = tryProcessOversizedSegmentFastPath(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, logger, debugMetaKey, maxContentLength);
if (fast) return fast;
return processOversizedSegmentIterative(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength);
};
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey, maxContentLength, rawPatternProcessor) => {
const pageIds = pages.map((p) => p.id);
const pageIdToIndex = buildPageIdToIndexMap(pageIds);
const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
const cumulativeOffsets = buildCumulativeOffsets(pageIds, normalizedPages);
const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor, rawPatternProcessor);
const result = [];
logger?.info?.("Starting breakpoint processing", {
maxPages,
segmentCount: segments.length
});
logger?.debug?.("[breakpoints] inputSegments", {
segmentCount: segments.length,
segments: segments.map((s) => ({
contentLength: s.content.length,
from: s.from,
to: s.to
}))
});
for (const segment of segments) {
const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
const segmentSpan = (segment.to ?? segment.from) - segment.from;
const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
const fitsInPages = segmentSpan <= maxPages;
const fitsInLength = !maxContentLength || segment.content.length <= maxContentLength;
if (fitsInPages && fitsInLength && !hasExclusions) {
result.push(segment);
continue;
}
logger?.debug?.("[breakpoints] Processing oversized segment", {
contentLength: segment.content.length,
from: segment.from,
hasExclusions,
pageSpan: toIdx - fromIdx + 1,
reasonFitsInLength: fitsInLength,
reasonFitsInPages: fitsInPages,
to: segment.to
});
const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength);
result.push(...broken.map((s) => {
const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
if (segFromIdx >= 0 && segToIdx > segFromIdx) return {
...s,
content: applyPageJoinerBetweenPages(s.content, segFromIdx, segToIdx, pageIds, normalizedPages, pageJoiner)
};
return s;
}));
}
logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
return result;
};
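/*
* Note (not from the original bundle): applyBreakpoints is the orchestrator.
* It precomputes the page-id index, normalized page map and cumulative
* offsets, expands the breakpoint patterns once, passes through segments that
* already fit (by page span and maxContentLength, with no excluded pages),
* splits the rest via processOversizedSegment, and re-applies the configured
* pageJoiner to any resulting piece that still spans multiple pages.
*/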
//#endregion
//#region src/segmentation/rule-regex.ts
/**
* Checks if a regex pattern contains standard (anonymous) capturing groups.
*
* Detects standard capturing groups `(...)` while excluding:
* - Non-capturing groups `(?:...)`
* - Lookahead assertions `(?=...)` and `(?!...)`
* - Lookbehind assertions `(?<=...)` and `(?<!...)`
* - Named groups `(?<name>...)` (start with `(?` so excluded here)
*
* NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
*/
const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
/**
* Extracts named capture group names from a regex pattern.
*
* Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
*
* @example
* extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
* extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
* extractNamedCaptureNames('^\\d+') // []
*/
const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([A-Za-z_]\w*)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
/**
* Safely compiles a regex pattern, throwing a helpful error if invalid.
*/
const compileRuleRegex = (pattern) => {
try {
return new RegExp(pattern, "gmu");
} catch (error) {
throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${error instanceof Error ? error.message : String(error)}`);
}
};
/**
* Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
*
* Brackets `()[]` outside `{{tokens}}` are auto-escaped.
*/
const processPattern = (pattern, fuzzy, capturePrefix) => {
const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
return {
captureNames,
pattern: expanded
};
};
/**
* Processes a breakpoint pattern by expanding tokens only.
*
* Unlike `processPattern`, this does NOT escape brackets because breakpoints
* are treated as raw regex patterns (like the `regex` rule type).
* Users have full control over regex syntax including `(?:...)` groups.
*/
const processBreakpointPattern = (pattern) => {
const { pattern: expanded } = expandTokensWithCaptures(pattern);
return expanded;
};
/**
* Builds the raw regex source for a `lineStartsAfter` rule.
*
* Expands each pattern through `processPattern()`, combines them into an
* alternation at the start of a line, and appends a trailing content capture.
*
* @param patterns - Template-like line-start markers to match
* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
* @param capturePrefix - Optional prefix used for internal named captures
* @returns Regex source plus the named captures extracted from the patterns
*/
const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
return {
captureNames: processed.flatMap((p) => p.captureNames),
regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
};
};
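/*
* Illustrative sketch (not from the original bundle), ignoring fuzzy expansion
* and with no capturePrefix:
*
*   buildLineStartsAfterRegexSource(["باب"], false).regex
*   // -> "^[\u200E\u200F\u061C\u200B\u200C\u200D\uFEFF]*(?:(?<_r0>باب))(.*)"
*
* i.e. an optional bidi/zero-width prefix, one tagged alternation branch per
* pattern (_r0, _r1, ...), then the rest of the line captured as content.
*/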
/**
* Builds the raw regex source for a `lineStartsWith` rule.
*
* Expands each pattern through `processPattern()` and combines them into an
* alternation anchored at the start of a line.
*
* @param patterns - Template-like line-start markers to match
* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
* @param capturePrefix - Optional prefix used for internal named captures
* @returns Regex source plus the named captures extracted from the patterns
*/
const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
return {
captureNames: processed.flatMap((p) => p.captureNames),
regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
};
};
/**
* Builds the raw regex source for a `lineEndsWith` rule.
*
* Expands each pattern through `processPattern()` and combines them into an
* end-anchored alternation.
*
* @param patterns - Template-like line-end markers to match
* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
* @param capturePrefix - Optional prefix used for internal named captures
* @returns Regex source plus the named captures extracted from the patterns
*/
const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
return {
captureNames: processed.flatMap((p) => p.captureNames),
regex: `(?:${alternatives})$`
};
};
/**
* Builds the raw regex source for a `template` rule.
*
* Expands tokens and named captures via `expandTokensWithCaptures()` after
* applying `escapeTemplateBrackets()` to non-token brackets.
*
* @param template - Template string containing optional `{{token}}` markers
* @param capturePrefix - Optional prefix used for internal named captures
* @returns Regex source plus the named captures extracted from the template
*/
const buildTemplateRegexSource = (template, capturePrefix) => {
const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
return {
captureNames,
regex: pattern
};
};
const getFuzzyCandidatePatterns = (rule) => [
|
|
3389
|
-
..."lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) ? rule.lineStartsWith : [],
|
|
3390
|
-
..."lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) ? rule.lineStartsAfter : [],
|
|
3391
|
-
..."lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) ? rule.lineEndsWith : []
|
|
3392
|
-
];
|
|
3393
|
-
const buildLineBasedRuleRegex = (rule, fuzzy, capturePrefix) => {
|
|
3394
|
-
if ("lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) && rule.lineStartsWith.length > 0) return buildLineStartsWithRegexSource(rule.lineStartsWith, fuzzy, capturePrefix);
|
|
3395
|
-
if ("lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) && rule.lineEndsWith.length > 0) return buildLineEndsWithRegexSource(rule.lineEndsWith, fuzzy, capturePrefix);
|
|
3396
|
-
if ("template" in rule && typeof rule.template === "string") return buildTemplateRegexSource(rule.template, capturePrefix);
|
|
3397
|
-
if ("dictionaryEntry" in rule && rule.dictionaryEntry) return buildArabicDictionaryEntryRegexSource(rule.dictionaryEntry, capturePrefix);
|
|
3398
|
-
return null;
|
|
3399
|
-
};
|
|
3400
|
-
/**
|
|
3401
|
-
* Builds a compiled regex and metadata from a split rule.
|
|
3402
|
-
*
|
|
3403
|
-
* Behavior mirrors the previous implementation in `segmenter.ts`.
|
|
3404
|
-
*/
|
|
3405
|
-
const buildRuleRegex = (rule, capturePrefix) => {
|
|
3406
|
-
const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy(getFuzzyCandidatePatterns(rule));
|
|
3407
|
-
if ("lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) && rule.lineStartsAfter.length > 0) {
|
|
3408
|
-
const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(rule.lineStartsAfter, fuzzy, capturePrefix);
|
|
3409
|
-
return {
|
|
3410
|
-
captureNames,
|
|
3411
|
-
regex: compileRuleRegex(lsaRegex),
|
|
3412
|
-
usesCapture: true,
|
|
3413
|
-
usesLineStartsAfter: true
|
|
3414
|
-
};
|
|
3415
|
-
}
|
|
3416
|
-
const ruleRegexSource = buildLineBasedRuleRegex(rule, fuzzy, capturePrefix);
|
|
3417
|
-
let finalRegex = ruleRegexSource?.regex;
|
|
3418
|
-
let allCaptureNames = ruleRegexSource?.captureNames ?? [];
|
|
3419
|
-
if (!finalRegex && "regex" in rule && typeof rule.regex === "string") finalRegex = rule.regex;
|
|
3420
|
-
if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, lineEndsWith, or dictionaryEntry");
|
|
3421
|
-
if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
|
|
3422
|
-
return {
|
|
3423
|
-
captureNames: allCaptureNames,
|
|
3424
|
-
regex: compileRuleRegex(finalRegex),
|
|
3425
|
-
usesCapture: hasCapturingGroup(finalRegex),
|
|
3426
|
-
usesLineStartsAfter: false
|
|
3427
|
-
};
|
|
3428
|
-
};
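/*
 * Usage sketch (illustrative only; the exact expanded source depends on
 * processPattern and compileRuleRegex defined earlier in this bundle):
 *
 *   const built = buildRuleRegex({ lineEndsWith: ["تم"] });
 *   // built.regex is an end-anchored alternation roughly of the form
 *   // /(?:(?<_r0>…))$/, built.usesLineStartsAfter === false, and
 *   // built.usesCapture reflects hasCapturingGroup(finalRegex).
 */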
//#endregion
//#region src/segmentation/fast-fuzzy-prefix.ts
/**
 * Fast-path fuzzy prefix matching for common Arabic line-start markers.
 *
 * This exists to avoid running expensive fuzzy-expanded regex alternations over
 * a giant concatenated string. Instead, we match only at known line-start
 * offsets and perform a small deterministic comparison:
 * - Skip Arabic diacritics in the CONTENT
 * - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
 *
 * This module is intentionally conservative: it only supports "literal"
 * token patterns (plain text alternation via `|`), not general regex.
 */
const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
const equivKey = (ch) => {
    switch (ch) {
        case "آ":
        case "أ":
        case "إ": return "ا";
        case "ه": return "ة";
        case "ي": return "ى";
        default: return ch;
    }
};
const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
    let i = offset;
    while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
    for (let j = 0; j < literal.length; j++) {
        const litCh = literal[j];
        while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
        if (i >= content.length || equivKey(content[i]) !== equivKey(litCh)) return null;
        i++;
    }
    while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
    return i;
};
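/*
 * Behavior sketch (illustrative inputs): diacritics in the content are
 * skipped, and hamza-carrier / taa-marbuta / alif-maqsura variants compare
 * equal via equivKey:
 *
 *   matchFuzzyLiteralPrefixAt("بَاب", 0, "باب"); // => 4 (fatha skipped)
 *   matchFuzzyLiteralPrefixAt("إذا", 0, "اذا"); // => 3 (إ matches ا)
 *   matchFuzzyLiteralPrefixAt("كتاب", 0, "باب"); // => null
 */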
const isLiteralOnly = (s) => !/[\\[\]{}()^$.*+?]/.test(s);
const compileLiteralAlternation = (pattern) => {
    if (!pattern || !isLiteralOnly(pattern)) return null;
    const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
    return alternatives.length ? { alternatives } : null;
};
const compileFastFuzzyTokenRule = (tokenTemplate) => {
    const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
    if (!m) return null;
    const token = m[1];
    if (!(token in TOKEN_PATTERNS)) return null;
    const compiled = compileLiteralAlternation(getTokenPattern(token));
    return compiled ? {
        alternatives: compiled.alternatives,
        token
    } : null;
};
const matchFastFuzzyTokenAt = (content, offset, compiled) => {
    for (const alt of compiled.alternatives) {
        const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
        if (end !== null) return end;
    }
    return null;
};
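/*
 * Usage sketch (assuming the {{bab}} token resolves to the literal pattern
 * "باب" declared in BASE_TOKENS):
 *
 *   const compiled = compileFastFuzzyTokenRule("{{bab}}");
 *   // => { alternatives: ["باب"], token: "bab" }
 *   matchFastFuzzyTokenAt("باب الصلاة", 0, compiled); // => 3
 *   compileFastFuzzyTokenRule("{{harf}}"); // => null (character class, not literal-only)
 */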
//#endregion
//#region src/segmentation/segmenter-rule-utils.ts
const tryCompileFastFuzzyRule = (rule) => {
    const fuzzyCandidatePatterns = [..."lineStartsWith" in rule ? rule.lineStartsWith : [], ..."lineStartsAfter" in rule ? rule.lineStartsAfter : []];
    if (!(rule.fuzzy ?? shouldDefaultToFuzzy(fuzzyCandidatePatterns))) return null;
    if ("lineStartsWith" in rule && rule.lineStartsWith?.length === 1) {
        const compiled = compileFastFuzzyTokenRule(rule.lineStartsWith[0]);
        if (compiled) return {
            compiled,
            kind: "startsWith"
        };
    }
    if ("lineStartsAfter" in rule && rule.lineStartsAfter?.length === 1) {
        const compiled = compileFastFuzzyTokenRule(rule.lineStartsAfter[0]);
        if (compiled) return {
            compiled,
            kind: "startsAfter"
        };
    }
    return null;
};
const isCombinableRule = (rule) => {
    if ("regex" in rule && rule.regex) return extractNamedCaptureNames(rule.regex).length === 0 && !/\\[1-9]/.test(rule.regex) && !hasCapturingGroup(rule.regex);
    return true;
};
const partitionRulesForMatching = (rules) => {
    const combinableRules = [];
    const standaloneRules = [];
    const fastFuzzyRules = [];
    for (let index = 0; index < rules.length; index++) {
        const rule = rules[index];
        const fuzzyComp = tryCompileFastFuzzyRule(rule);
        if (fuzzyComp) {
            fastFuzzyRules.push({
                compiled: fuzzyComp.compiled,
                kind: fuzzyComp.kind,
                rule,
                ruleIndex: index
            });
            continue;
        }
        if (isCombinableRule(rule)) combinableRules.push({
            index,
            prefix: `r${index}_`,
            rule
        });
        else standaloneRules.push({
            index,
            rule
        });
    }
    return {
        combinableRules,
        fastFuzzyRules,
        standaloneRules
    };
};
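/*
 * Partitioning sketch (illustrative rules): a single-token fuzzy rule takes
 * the fast-fuzzy path, a capture-free rule joins the combined alternation,
 * and a rule whose regex has capturing groups is matched standalone:
 *
 *   partitionRulesForMatching([
 *     { fuzzy: true, lineStartsWith: ["{{bab}}"] }, // -> fastFuzzyRules
 *     { lineStartsWith: ["فصل"] }, // -> combinableRules
 *     { regex: "^(باب .+)$" } // -> standaloneRules
 *   ]);
 */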
const STRONG_SENTENCE_TERMINATORS = /[.!?؟؛۔…]$/u;
const TRAILING_PAGE_WRAP_NOISE = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>]+$/u;
const TRAILING_WORD_DELIMITERS = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>.,!?؟؛،:]+$/u;
const ARABIC_WORD_REGEX = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
const trimTrailingPageWrapNoise = (text) => {
    let trimmed = text.trimEnd();
    while (trimmed !== trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "")) trimmed = trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "");
    return trimmed;
};
const endsWithStrongSentenceTerminator = (pageContent) => {
    return STRONG_SENTENCE_TERMINATORS.test(trimTrailingPageWrapNoise(pageContent));
};
const extractLastArabicWord = (pageContent) => {
    return [...trimTrailingPageWrapNoise(pageContent).replace(TRAILING_WORD_DELIMITERS, "").matchAll(ARABIC_WORD_REGEX)].at(-1)?.[0] ?? "";
};
const shouldAllowPageStartMatch = (previousPageContent, prevWordStoplist) => {
    if (!prevWordStoplist || endsWithStrongSentenceTerminator(previousPageContent)) return true;
    const lastWord = extractLastArabicWord(previousPageContent);
    return !lastWord || !prevWordStoplist.has(normalizeArabicForComparison(lastWord));
};
const shouldAllowSamePageMatch = (contentBeforeMatch, stoplist) => {
    if (!stoplist) return true;
    const lastWord = extractLastArabicWord(contentBeforeMatch);
    return !lastWord || !stoplist.has(normalizeArabicForComparison(lastWord));
};
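/*
 * Guard sketch (illustrative stoplist): a page-start match is blocked when
 * the previous page ends mid-sentence on a stoplisted word, but allowed
 * after a strong terminator:
 *
 *   const stoplist = new Set([normalizeArabicForComparison("قال")]);
 *   shouldAllowPageStartMatch("ثم قال", stoplist); // => false
 *   shouldAllowPageStartMatch("انتهى.", stoplist); // => true (ends with ".")
 */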
const createPageStartGuardChecker = (matchContent, pageMap) => {
    const pageStartToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.start, i]));
    const compiledPageStartPrev = /* @__PURE__ */ new Map();
    const compiledPrevWordStoplists = /* @__PURE__ */ new Map();
    const compiledSamePagePrevWordStoplists = /* @__PURE__ */ new Map();
    const pageIdToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.id, i]));
    const getPageStartPrevRegex = (rule, ruleIndex) => {
        if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
        const pattern = rule.pageStartGuard;
        if (!pattern) {
            compiledPageStartPrev.set(ruleIndex, null);
            return null;
        }
        const re = new RegExp(`(?:${processPattern(pattern, false).pattern})$`, "u");
        compiledPageStartPrev.set(ruleIndex, re);
        return re;
    };
    const getPrevWordStoplist = (rule, ruleIndex) => {
        if (compiledPrevWordStoplists.has(ruleIndex)) return compiledPrevWordStoplists.get(ruleIndex) ?? null;
        const stoplist = rule.pageStartPrevWordStoplist;
        if (!stoplist?.length) {
            compiledPrevWordStoplists.set(ruleIndex, null);
            return null;
        }
        const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
        compiledPrevWordStoplists.set(ruleIndex, normalized);
        return normalized;
    };
    const getSamePagePrevWordStoplist = (rule, ruleIndex) => {
        if (compiledSamePagePrevWordStoplists.has(ruleIndex)) return compiledSamePagePrevWordStoplists.get(ruleIndex) ?? null;
        const stoplist = rule.samePagePrevWordStoplist;
        if (!stoplist?.length) {
            compiledSamePagePrevWordStoplists.set(ruleIndex, null);
            return null;
        }
        const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
        compiledSamePagePrevWordStoplists.set(ruleIndex, normalized);
        return normalized;
    };
    const getPreviousPageContent = (boundaryIndex) => {
        if (boundaryIndex <= 0) return "";
        const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
        return matchContent.slice(prevBoundary.start, prevBoundary.end);
    };
    const getPrevPageLastNonWsChar = (boundaryIndex) => {
        if (boundaryIndex <= 0) return "";
        const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
        for (let i = prevBoundary.end - 1; i >= prevBoundary.start; i--) {
            const ch = matchContent[i];
            if (ch && !/\s/u.test(ch)) return ch;
        }
        return "";
    };
    const getCurrentPageContentBeforeMatch = (matchStart) => {
        const pageId = pageMap.getId(matchStart);
        const boundaryIndex = pageIdToBoundaryIndex.get(pageId);
        if (boundaryIndex === void 0) return "";
        const boundary = pageMap.boundaries[boundaryIndex];
        return matchContent.slice(boundary.start, matchStart);
    };
    return (rule, ruleIndex, matchStart) => {
        const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
        if (boundaryIndex !== void 0 && boundaryIndex !== 0) {
            const prevReq = getPageStartPrevRegex(rule, ruleIndex);
            if (prevReq) {
                const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
                if (!lastChar || !prevReq.test(lastChar)) return false;
            }
            return shouldAllowPageStartMatch(getPreviousPageContent(boundaryIndex), getPrevWordStoplist(rule, ruleIndex));
        }
        return shouldAllowSamePageMatch(getCurrentPageContentBeforeMatch(matchStart), getSamePagePrevWordStoplist(rule, ruleIndex));
    };
};
/**
 * Checks if a pageId matches the min/max/exclude constraints of a rule.
 */
const passesRuleConstraints$1 = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
/**
 * Records a split point for a specific rule.
 */
const recordSplitPointAt = (splitPointsByRule, ruleIndex, sp) => {
    const arr = splitPointsByRule.get(ruleIndex);
    if (!arr) splitPointsByRule.set(ruleIndex, [sp]);
    else arr.push(sp);
};
const attemptFastFuzzyMatch = (matchContent, lineStart, { compiled, kind, rule, ruleIndex }, splitPointsByRule) => {
    const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
    if (end === null) return;
    const splitAt = rule.split ?? "at";
    const splitIndex = splitAt === "at" ? lineStart : end;
    if (kind === "startsWith") recordSplitPointAt(splitPointsByRule, ruleIndex, {
        index: splitIndex,
        meta: rule.meta
    });
    else {
        const markerLength = end - lineStart;
        recordSplitPointAt(splitPointsByRule, ruleIndex, {
            contentStartOffset: splitAt === "at" ? markerLength : void 0,
            index: splitIndex,
            meta: rule.meta
        });
    }
};
/**
 * Processes matches for all fast-fuzzy rules at a specific line start.
 */
const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule) => {
    for (const ffRule of fastFuzzyRules) {
        if (!passesRuleConstraints$1(ffRule.rule, pageId)) continue;
        if (!passesPageStartGuard(ffRule.rule, ffRule.ruleIndex, lineStart)) continue;
        attemptFastFuzzyMatch(matchContent, lineStart, ffRule, splitPointsByRule);
    }
};
const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, passesPageStartGuard) => {
    const splitPointsByRule = /* @__PURE__ */ new Map();
    if (fastFuzzyRules.length === 0 || pageMap.boundaries.length === 0) return splitPointsByRule;
    let boundaryIdx = 0;
    let currentBoundary = pageMap.boundaries[boundaryIdx];
    const advanceBoundaryTo = (offset) => {
        while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
            boundaryIdx++;
            currentBoundary = pageMap.boundaries[boundaryIdx];
        }
    };
    for (let lineStart = 0; lineStart <= matchContent.length;) {
        advanceBoundaryTo(lineStart);
        const pageId = currentBoundary?.id ?? 0;
        if (lineStart >= matchContent.length) break;
        processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule);
        const nextNl = matchContent.indexOf("\n", lineStart);
        if (nextNl === -1) break;
        lineStart = nextNl + 1;
    }
    return splitPointsByRule;
};
//#endregion
//#region src/segmentation/split-point-helpers.ts
const MAX_REGEX_ITERATIONS = 1e5;
const extractNamedCapturesForRule = (groups, captureNames, prefix) => {
    const result = {};
    if (!groups) return result;
    for (const name of captureNames) if (groups[name] !== void 0) result[name.slice(prefix.length)] = groups[name];
    return result;
};
const buildContentOffsets = (match, ruleInfo) => {
    if (!ruleInfo.usesLineStartsAfter) return {};
    const captured = match.groups?.[`${ruleInfo.prefix}__content`];
    if (captured === void 0) return {};
    return { contentStartOffset: (match.groups?.[ruleInfo.prefix] ?? match[0]).length - captured.length };
};
const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
const createSplitPointFromMatch = (match, rule, ruleInfo) => {
    const namedCaptures = extractNamedCapturesForRule(match.groups, ruleInfo.captureNames, ruleInfo.prefix);
    const wordIndex = extractDebugIndex(match.groups, "_r");
    return {
        capturedContent: void 0,
        contentStartOffset: buildContentOffsets(match, ruleInfo).contentStartOffset,
        index: (rule.split ?? "at") === "at" ? match.index : match.index + match[0].length,
        meta: rule.meta,
        namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0,
        wordIndex
    };
};
const addSplitPoint = (splitPointsByRule, originalIndex, point) => {
    const arr = splitPointsByRule.get(originalIndex);
    if (!arr) {
        splitPointsByRule.set(originalIndex, [point]);
        return;
    }
    arr.push(point);
};
/**
 * Executes a combined regex over the content for combinable rules and records
 * any resulting split points into `splitPointsByRule`.
 *
 * This function mutates `splitPointsByRule` in place and throws if the regex
 * iteration guard is exceeded.
 *
 * @param matchContent - Concatenated content being segmented
 * @param combinableRules - Rules that can be combined into a single alternation
 * @param ruleRegexes - Compiled regex metadata aligned with `combinableRules`
 * @param pageMap - Page boundary mapping utilities for the content
 * @param passesPageStartGuard - Callback that decides whether a match is allowed
 * @param splitPointsByRule - Mutable map collecting split points by rule index
 * @param logger - Optional logger for iteration diagnostics
 * @returns Nothing; results are written into `splitPointsByRule`
 */
const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
    assertCombinedRuleAlignment(combinableRules, ruleRegexes);
    const combinedSource = ruleRegexes.map((r) => r.source).join("|");
    const combinedRegex = new RegExp(combinedSource, "gm");
    logger?.debug?.("[segmenter] combined regex built", {
        combinableRuleCount: combinableRules.length,
        combinedSourceLength: combinedSource.length
    });
    let m = combinedRegex.exec(matchContent);
    let iterations = 0;
    while (m !== null) {
        if (++iterations > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop: exceeded ${MAX_REGEX_ITERATIONS} iterations at position ${m.index}.`);
        if (iterations % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count", {
            iterations,
            position: m.index
        });
        processCombinedMatch(combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, m);
        if (m[0].length === 0) combinedRegex.lastIndex++;
        m = combinedRegex.exec(matchContent);
    }
};
const assertCombinedRuleAlignment = (combinableRules, ruleRegexes) => {
    if (combinableRules.length !== ruleRegexes.length) throw new Error(`processCombinedMatches: combinableRules/ruleRegexes length mismatch (${combinableRules.length} !== ${ruleRegexes.length})`);
    for (let i = 0; i < combinableRules.length; i++) if (!ruleRegexes[i].source.includes(`(?<${combinableRules[i].prefix}>`)) throw new Error(`processCombinedMatches: regex alignment mismatch for prefix "${combinableRules[i].prefix}" at index ${i}`);
};
const processCombinedMatch = (combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, match) => {
    const matchedIndex = combinableRules.findIndex(({ prefix }) => match.groups?.[prefix] !== void 0);
    if (matchedIndex === -1) return;
    const { rule, index: originalIndex } = combinableRules[matchedIndex];
    if (!passesRuleConstraints(rule, pageMap.getId(match.index)) || !passesPageStartGuard(rule, originalIndex, match.index)) return;
    addSplitPoint(splitPointsByRule, originalIndex, createSplitPointFromMatch(match, rule, ruleRegexes[matchedIndex]));
};
/**
 * Builds compiled regex metadata for each combinable rule while preserving the
 * prefix used to identify the matching branch inside a combined alternation.
 *
 * @param combinableRules - Rules eligible for combined-regex processing
 * @returns Rule regex metadata aligned with the input order
 */
const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
    const built = buildRuleRegex(rule, prefix);
    return {
        ...built,
        prefix,
        source: `(?<${prefix}>${built.regex.source})`
    };
});
/**
 * Processes a standalone rule by matching it independently and appending its
 * resulting split points into `splitPointsByRule`.
 *
 * @param rule - The standalone split rule to evaluate
 * @param ruleIndex - Original rule index in the caller's rules array
 * @param matchContent - Concatenated content being segmented
 * @param pageMap - Page boundary mapping utilities for the content
 * @param passesPageStartGuard - Callback that decides whether a match is allowed
 * @param splitPointsByRule - Mutable map collecting split points by rule index
 * @returns Nothing; results are written into `splitPointsByRule`
 */
const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
    const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
    const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
        const isLSA = usesLineStartsAfter && m.captured !== void 0;
        return {
            capturedContent: isLSA ? void 0 : m.captured,
            contentStartOffset: isLSA ? m.end - m.captured.length - m.start : void 0,
            index: (rule.split ?? "at") === "at" ? m.start : m.end,
            meta: rule.meta,
            namedCaptures: m.namedCaptures,
            wordIndex: m.wordIndex
        };
    });
    const arr = splitPointsByRule.get(ruleIndex);
    if (!arr) splitPointsByRule.set(ruleIndex, points);
    else arr.push(...points);
};
const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
    const matches = [];
    let m = regex.exec(content);
    while (m !== null) {
        const wordIndex = extractDebugIndex(m.groups, "_r");
        matches.push({
            captured: usesCapture ? getLastPositionalCapture(m) : void 0,
            end: m.index + m[0].length,
            namedCaptures: extractNamedCaptures(m.groups, captureNames),
            start: m.index,
            wordIndex
        });
        if (m[0].length === 0) regex.lastIndex++;
        m = regex.exec(content);
    }
    return matches;
};
/**
 * Applies per-rule occurrence filtering and optional debug metadata patches to
 * the collected split points.
 *
 * @param rules - Full rule list in original order
 * @param splitPointsByRule - Split points grouped by originating rule index
 * @param debugMetaKey - Optional metadata key used for debug provenance patches
 * @returns Flattened split points after occurrence filtering and debug merging
 */
const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
    const result = [];
    rules.forEach((rule, index) => {
        const points = splitPointsByRule.get(index);
        if (!points?.length) return;
        const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
        result.push(...filtered.map((p) => {
            const debugPatch = debugMetaKey ? buildRuleDebugPatch(index, rule, p.wordIndex) : null;
            return {
                ...p,
                meta: debugMetaKey ? mergeDebugIntoMeta(p.meta, debugMetaKey, debugPatch) : p.meta,
                ruleIndex: index
            };
        }));
    });
    return result;
};
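/*
 * Occurrence sketch (illustrative data): "first" keeps only the earliest
 * split point per rule, "last" only the latest, and the default keeps all,
 * each tagged with its originating ruleIndex:
 *
 *   const byRule = new Map([[0, [{ index: 5 }, { index: 90 }]]]);
 *   applyOccurrenceFilter([{ occurrence: "first" }], byRule);
 *   // => roughly [{ index: 5, ruleIndex: 0 }]
 */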
//#endregion
//#region src/segmentation/segmenter.ts
/**
 * Builds a concatenated content string and page mapping from input pages.
 *
 * Pages are joined with newline characters, and a page map is created to
 * track which page each offset belongs to. This allows pattern matching
 * across page boundaries while preserving page reference information.
 *
 * @param pages - Array of input pages with id and content
 * @returns Concatenated content string and page mapping utilities
 *
 * @example
 * const pages = [
 *   { id: 1, content: 'Page 1 text' },
 *   { id: 2, content: 'Page 2 text' }
 * ];
 * const { content, pageMap } = buildPageMap(pages);
 * // content = 'Page 1 text\nPage 2 text'
 * // pageMap.getId(0) = 1
 * // pageMap.getId(12) = 2
 */
const buildPageMap = (pages) => {
    const boundaries = [];
    const pageBreaks = [];
    let offset = 0;
    const parts = [];
    for (let i = 0; i < pages.length; i++) {
        const normalized = normalizeLineEndings(pages[i].content);
        boundaries.push({
            end: offset + normalized.length,
            id: pages[i].id,
            start: offset
        });
        parts.push(normalized);
        if (i < pages.length - 1) {
            pageBreaks.push(offset + normalized.length);
            offset += normalized.length + 1;
        } else offset += normalized.length;
    }
    const findBoundary = (off) => {
        let lo = 0, hi = boundaries.length - 1;
        while (lo <= hi) {
            const mid = lo + hi >>> 1;
            const b = boundaries[mid];
            if (off < b.start) hi = mid - 1;
            else if (off > b.end) lo = mid + 1;
            else return b;
        }
        return boundaries.at(-1);
    };
    return {
        content: parts.join("\n"),
        normalizedPages: parts,
        pageMap: {
            boundaries,
            getId: (off) => findBoundary(off)?.id ?? 0,
            pageBreaks,
            pageIds: boundaries.map((b) => b.id)
        }
    };
};
/**
 * Deduplicate split points by index, preferring ones with more information.
 *
 * Preference rules (when same index):
 * - Prefer a split with `contentStartOffset` (needed for `lineStartsAfter` marker stripping)
 * - Otherwise prefer a split with `meta` over one without
 */
const dedupeSplitPoints = (splitPoints) => {
    const byIndex = /* @__PURE__ */ new Map();
    for (const p of splitPoints) {
        const existing = byIndex.get(p.index);
        if (!existing) {
            byIndex.set(p.index, p);
            continue;
        }
        byIndex.set(p.index, mergeSplitPoints(existing, p));
    }
    return [...byIndex.values()].sort((a, b) => a.index - b.index);
};
const prefersIncomingSplitPoint = (existing, incoming) => incoming.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || incoming.meta !== void 0 && existing.meta === void 0;
const mergeRecord = (existing, incoming) => existing || incoming ? {
    ...existing ?? {},
    ...incoming ?? {}
} : void 0;
const mergeSplitPoints = (existing, incoming) => {
    const preferred = prefersIncomingSplitPoint(existing, incoming) ? incoming : existing;
    const fallback = preferred === incoming ? existing : incoming;
    return {
        ...fallback,
        ...preferred,
        contentStartOffset: preferred.contentStartOffset ?? fallback.contentStartOffset,
        meta: mergeRecord(existing.meta, incoming.meta),
        namedCaptures: mergeRecord(existing.namedCaptures, incoming.namedCaptures)
    };
};
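/*
 * Merge sketch (illustrative data): two split points at the same index fold
 * into one, keeping the marker-stripping offset and unioning metadata:
 *
 *   mergeSplitPoints({ index: 10, meta: { type: "bab" } }, { index: 10, contentStartOffset: 6 });
 *   // => { index: 10, contentStartOffset: 6, meta: { type: "bab" }, namedCaptures: undefined }
 */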
/**
 * If no structural rules produced segments, create a single segment spanning all pages.
 * This allows breakpoint processing to still run.
 */
const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
    if (segments.length > 0 || pages.length === 0) return segments;
    const firstPage = pages[0];
    const lastPage = pages.at(-1);
    const joiner = pageJoiner === "newline" ? "\n" : " ";
    const allContent = normalizedContent.join(joiner).replace(/\s+$/u, "");
    if (!allContent.trim()) return segments;
    const initialSeg = {
        content: allContent,
        from: firstPage.id
    };
    if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
    return [initialSeg];
};
const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey, logger) => {
    logger?.debug?.("[segmenter] collecting split points from rules", {
        contentLength: matchContent.length,
        ruleCount: rules.length
    });
    const passesPageStartGuard = createPageStartGuardChecker(matchContent, pageMap);
    const { combinableRules, fastFuzzyRules, standaloneRules } = partitionRulesForMatching(rules);
    logger?.debug?.("[segmenter] rules partitioned", {
        combinableCount: combinableRules.length,
        fastFuzzyCount: fastFuzzyRules.length,
        standaloneCount: standaloneRules.length
    });
    const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
    if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
    for (const { rule, index } of standaloneRules) processStandaloneRule(rule, index, matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
    return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
};
/**
 * Finds page breaks within a given offset range using binary search.
 * O(log n + k) where n = total breaks, k = breaks in range.
 *
 * @param startOffset - Start of range (inclusive)
 * @param endOffset - End of range (exclusive)
 * @param sortedBreaks - Sorted array of page break offsets
 * @returns Array of break offsets relative to startOffset
 */
const findBreaksInRange = (startOffset, endOffset, sortedBreaks) => {
    if (sortedBreaks.length === 0) return [];
    let lo = 0, hi = sortedBreaks.length;
    while (lo < hi) {
        const mid = lo + hi >>> 1;
        if (sortedBreaks[mid] < startOffset) lo = mid + 1;
        else hi = mid;
    }
    const result = [];
    for (let i = lo; i < sortedBreaks.length && sortedBreaks[i] < endOffset; i++) result.push(sortedBreaks[i] - startOffset);
    return result;
};
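/*
 * Range sketch (illustrative offsets): breaks inside [startOffset, endOffset)
 * are returned relative to startOffset:
 *
 *   findBreaksInRange(5, 20, [3, 8, 15, 25]); // => [3, 10]
 */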
/**
 * Converts page-break newlines to spaces in segment content.
 *
 * When a segment spans multiple pages, the newline characters that were
 * inserted as page separators during concatenation are converted to spaces
 * for more natural reading.
 *
 * Uses binary search for O(log n + k) lookup instead of O(n) iteration.
 *
 * @param content - Segment content string
 * @param startOffset - Starting offset of this content in concatenated string
 * @param pageBreaks - Sorted array of page break offsets
 * @param pageJoiner - How to represent page boundaries in output (`space` vs `newline`)
 * @returns Content with page-break newlines converted to spaces (or left as-is for `newline`)
 */
const convertPageBreaks = (content, startOffset, pageBreaks, pageJoiner) => {
    if (!content?.includes("\n")) return content;
    if (pageJoiner === "newline") return content;
    const breaksInRange = findBreaksInRange(startOffset, startOffset + content.length, pageBreaks);
    if (breaksInRange.length === 0) return content;
    const breakSet = new Set(breaksInRange);
    return content.replace(/\n/g, (match, offset) => breakSet.has(offset) ? " " : match);
};
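/*
 * Conversion sketch (illustrative offsets): only a newline sitting on a
 * recorded page break becomes a space; other newlines are left alone:
 *
 *   convertPageBreaks("abc\ndef", 100, [103], "space"); // => "abc def"
 *   convertPageBreaks("abc\ndef", 100, [103], "newline"); // => "abc\ndef"
 */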
/**
 * Segments pages of content based on pattern-matching rules.
 *
 * This is the main entry point for the segmentation engine. It takes an array
 * of pages and applies the provided rules to identify split points, producing
 * an array of segments with content, page references, and metadata.
 *
 * @param pages - Array of pages with id and content
 * @param options - Segmentation options including splitting rules
 * @returns Array of segments with content, from/to page references, and optional metadata
 *
 * @example
 * // Split markdown by headers
 * const segments = segmentPages(pages, {
 *   rules: [
 *     { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }
 *   ]
 * });
 *
 * @example
 * // Split Arabic hadith text with number extraction
 * const segments = segmentPages(pages, {
 *   rules: [
 *     {
 *       lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
 *       split: 'at',
 *       fuzzy: true,
 *       meta: { type: 'hadith' }
 *     }
 *   ]
 * });
 *
 * @example
 * // Multiple rules with page constraints
 * const segments = segmentPages(pages, {
 *   rules: [
 *     { lineStartsWith: ['{{kitab}}'], split: 'at', meta: { type: 'book' } },
 *     { lineStartsWith: ['{{bab}}'], split: 'at', min: 10, meta: { type: 'chapter' } },
 *     { regex: '^[٠-٩]+ - ', split: 'at', meta: { type: 'hadith' } }
 *   ]
 * });
 */
const segmentPages = (pages, options) => {
    const { rules = [], breakpoints = [], prefer = "longer", pageJoiner = "space", logger, maxContentLength, preprocess } = options;
    if (maxContentLength && maxContentLength < 50) throw new Error(`maxContentLength must be at least 50 characters.`);
    const maxPages = options.maxPages ?? Number.MAX_SAFE_INTEGER;
    const hasLimits = options.maxPages !== void 0 || maxContentLength !== void 0;
    const debug = resolveDebugConfig(options.debug);
    const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
    logger?.info?.("[segmenter] starting segmentation", {
        breakpointCount: breakpoints.length,
        maxContentLength,
        maxPages,
        pageCount: pages.length,
        prefer,
        preprocessCount: preprocess?.length ?? 0,
        ruleCount: rules.length
    });
    const preprocessedPages = preprocess && preprocess.length > 0 ? pages.map((page) => ({
        ...page,
        content: applyPreprocessToPage(page.content, page.id, preprocess)
    })) : pages;
    const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(preprocessedPages);
    logger?.debug?.("[segmenter] content built", {
        pageIds: pageMap.pageIds,
        totalContentLength: matchContent.length
    });
    const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
    const unique = dedupeSplitPoints(splitPoints);
    logger?.debug?.("[segmenter] split points collected", {
        rawSplitPoints: splitPoints.length,
        uniqueSplitPoints: unique.length
    });
    let segments = buildSegments(unique, matchContent, pageMap, rules, pageJoiner);
    logger?.debug?.("[segmenter] structural segments built", { segmentCount: segments.length });
    segments = ensureFallbackSegment(segments, preprocessedPages, normalizedContent, pageJoiner);
    if (hasLimits) {
        logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
        const result = applyBreakpoints(segments, preprocessedPages, normalizedContent, maxPages, breakpoints, prefer, (p) => processPattern(p, false).pattern, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0, maxContentLength, processBreakpointPattern);
        logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
        return result;
    }
    logger?.info?.("[segmenter] segmentation complete (structural only)", { finalSegmentCount: segments.length });
    return segments;
};
/**
 * Creates segment objects from split points.
 *
 * Handles segment creation including:
 * - Content extraction (with captured content for `lineStartsAfter`)
 * - Page break conversion to spaces
 * - From/to page reference calculation
 * - Metadata merging (static + named captures)
 *
 * @param splitPoints - Sorted, unique split points
 * @param content - Full concatenated content string
 * @param pageMap - Page mapping utilities
 * @param rules - Original rules (for constraint checking on first segment)
 * @returns Array of segment objects
 */
const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
    const getActualStart = (start, contentStartOffset) => start + (contentStartOffset ?? 0);
    const trimSegmentText = (sliced, capturedContent, contentStartOffset) => capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
    const getAdjustedStart = (actualStart, sliced, contentStartOffset) => actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
    const applyMeta = (meta, namedCaptures) => meta || namedCaptures ? {
        ...meta,
        ...namedCaptures
    } : void 0;
    /**
     * Creates a single segment from a content range.
     */
    const createSegment = (start, end, meta, capturedContent, namedCaptures, contentStartOffset) => {
        const actualStart = getActualStart(start, contentStartOffset);
        const sliced = content.slice(actualStart, end);
        let text = trimSegmentText(sliced, capturedContent, contentStartOffset);
        if (!text) return null;
        if (!capturedContent) text = convertPageBreaks(text, actualStart, pageMap.pageBreaks, pageJoiner);
        const adjustedStart = getAdjustedStart(actualStart, sliced, contentStartOffset);
        const from = pageMap.getId(adjustedStart);
        const to = capturedContent ? pageMap.getId(end - 1) : pageMap.getId(adjustedStart + text.length - 1);
        const seg = {
            content: text,
            from
        };
        if (to !== from) seg.to = to;
        const mergedMeta = applyMeta(meta, namedCaptures);
        if (mergedMeta) seg.meta = mergedMeta;
        return seg;
    };
    /**
     * Creates segments from an array of split points.
     */
    const createSegmentsFromSplitPoints = () => {
        const result = [];
        for (let i = 0; i < splitPoints.length; i++) {
            const sp = splitPoints[i];
            const end = splitPoints[i + 1]?.index ?? content.length;
            const s = createSegment(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
            if (s) result.push(s);
        }
        return result;
    };
    const segments = [];
    if (!splitPoints.length) {
        if (anyRuleAllowsId(rules, pageMap.getId(0))) {
            const s = createSegment(0, content.length);
            if (s) segments.push(s);
        }
        return segments;
    }
    if (splitPoints[0].index > 0) {
        if (anyRuleAllowsId(rules, pageMap.getId(0))) {
            const s = createSegment(0, splitPoints[0].index);
            if (s) segments.push(s);
        }
    }
    return [...segments, ...createSegmentsFromSplitPoints()];
};
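/*
 * Segment sketch (illustrative split point): for a `lineStartsAfter` rule,
 * contentStartOffset skips the matched marker when slicing, e.g. with
 * content "١ - حدثنا" and a split point { index: 0, contentStartOffset: 4 },
 * the resulting segment content starts at "حدثنا" while `from` is attributed
 * from the adjusted start offset.
 */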
//#endregion
//#region src/validation/validate-segments.ts
/**
 * Creates a short preview string of text content for error reporting.
 * Truncates content exceeding PREVIEW_LIMIT.
 */
const buildPreview = (text) => {
    const normalized = text.replace(/\s+/g, " ").trim();
    if (normalized.length <= 140) return normalized;
    return `${normalized.slice(0, 140)}...`;
};
/**
 * Creates a lightweight snapshot of a segment for inclusion in validation checks.
 */
const buildSegmentSnapshot = (segment) => ({
    contentPreview: buildPreview(segment.content),
    from: segment.from,
    to: segment.to
});
/**
 * Normalizes page content by applying preprocessing transforms and standardizing line endings.
 */
const normalizePages = (pages, options) => {
    const transforms = options.preprocess ?? [];
    return pages.map((page) => {
        return {
            content: normalizeLineEndings(transforms.length ? applyPreprocessToPage(page.content, page.id, transforms) : page.content),
            id: page.id
        };
    });
};
/**
 * Joins all page content into a single string with boundary tracking.
 * Returns the joined string and a list of boundary mappings (start/end indices for each page).
 */
const buildJoinedContent = (pages, joiner) => {
    const boundaries = [];
    const joined = pages.map((p) => p.content).join(joiner);
    let offset = 0;
    for (let i = 0; i < pages.length; i++) {
        const content = pages[i].content;
        const start = offset;
        const end = start + content.length;
        boundaries.push({
            end,
            id: pages[i].id,
            start
        });
        offset += content.length + (i < pages.length - 1 ? joiner.length : 0);
    }
    return {
        boundaries,
        joined
    };
};
/**
 * Binary search to find which page ID corresponds to a character offset in the joined content.
 * Returns undefined if the offset falls within a joiner gap or outside bounds.
 */
const findBoundaryIdForOffset = (offset, boundaries) => {
    let lo = 0;
    let hi = boundaries.length - 1;
    while (lo <= hi) {
        const mid = lo + hi >>> 1;
        const boundary = boundaries[mid];
        if (offset < boundary.start) hi = mid - 1;
        else if (offset > boundary.end) lo = mid + 1;
        else return boundary.id;
    }
    if (boundaries.length === 0) return;
    const last = boundaries.at(-1);
    return offset > last.end ? last.id : void 0;
};
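/*
 * Lookup sketch (illustrative boundaries for two 11-char pages joined by a
 * single-character joiner):
 *
 *   const boundaries = [{ end: 11, id: 1, start: 0 }, { end: 23, id: 2, start: 12 }];
 *   findBoundaryIdForOffset(5, boundaries); // => 1
 *   findBoundaryIdForOffset(14, boundaries); // => 2
 */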
/**
 * Helper to construct a standardized validation issue object.
 */
const createIssue = (type, segment, segmentIndex, overrides = {}, pageMap) => {
    const segmentSnapshot = buildSegmentSnapshot(segment);
    const page = pageMap?.get(segment.from);
    const matchIndex = overrides.matchIndex;
    const { matchIndex: _ignored, ...restOverrides } = overrides;
    const base = {
        actual: {
            from: segment.from,
            to: segment.to
        },
        segment: segmentSnapshot,
        segmentIndex,
        ...restOverrides
    };
    switch (type) {
        case "page_not_found": return {
            ...base,
            evidence: overrides.evidence ?? `Segment.from=${segment.from} does not exist in input pages.`,
            hint: "Check page IDs passed into segmentPages() and validateSegments().",
            severity: "error",
            type
        };
        case "content_not_found": return {
            ...base,
            evidence: overrides.evidence ?? "Segment content not found in any page content.",
            hint: overrides.hint ?? "Check preprocessing options, joiner settings, or whitespace normalization.",
            pageContext: page ? {
                pageId: page.id,
                pagePreview: buildPreview(page.content)
            } : void 0,
            severity: "error",
            type
        };
        case "page_attribution_mismatch": {
            const matchedFromId = overrides.expected?.from ?? overrides.actual?.from ?? segment.from;
            const actualPage = pageMap?.get(matchedFromId);
            return {
                ...base,
                evidence: overrides.evidence ?? `Content found in joined content at page ${matchedFromId}, but segment.from=${segment.from}.`,
                hint: overrides.hint ?? "Check duplicate content handling and boundary detection rules.",
                pageContext: actualPage ? {
                    matchIndex: matchIndex ?? -1,
                    pageId: actualPage.id,
                    pagePreview: buildPreview(actualPage.content)
                } : void 0,
                severity: "error",
                type
            };
        }
        case "max_pages_violation": return {
            ...base,
            evidence: overrides.evidence ?? `Segment spans pages ${segment.from}-${overrides.actual?.to}.`,
            hint: overrides.hint ?? "Check maxPages windowing in breakpoint-processor.ts and page constraints.",
            severity: "error",
            type
        };
        default: return {
            ...base,
            severity: "error",
            type
        };
    }
};
/**
 * Finds all occurrences of a content string within the joined text.
 * Respects search limits to avoid performance cliffs on highly repetitive content.
 */
const findJoinedMatches = (content, joined, searchStart, searchEnd, limit = Infinity) => {
    const matches = [];
    if (!content || searchStart >= searchEnd) return matches;
    let idx = joined.indexOf(content, searchStart);
    let count = 0;
    while (idx >= 0 && idx < searchEnd && count < limit) {
        matches.push({
            end: idx + content.length - 1,
            start: idx
        });
        idx = joined.indexOf(content, idx + 1);
        if (idx >= searchEnd) break;
        count++;
    }
    return matches;
};
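/*
 * Match sketch (illustrative inputs): `end` is the inclusive index of the
 * last matched character, and `limit` caps the number of matches collected:
 *
 *   findJoinedMatches("ab", "abxab", 0, 5); // => [{ end: 1, start: 0 }, { end: 4, start: 3 }]
 *   findJoinedMatches("ab", "abxab", 0, 5, 1); // => [{ end: 1, start: 0 }]
 */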
/**
 * Verifies that a matched segment falls within the allowed maxTerms/maxPages constraints.
 * Checks both implicit spans (calculated from match end) and explicit segment.to claims.
 */
const checkMaxPagesViolation = (segment, segmentIndex, maxPages, matchEnd, _expectedBoundaryEnd, boundaries) => {
    const actualToId = findBoundaryIdForOffset(matchEnd, boundaries);
    if (actualToId === void 0) return [];
    if (maxPages === 0) {
        if (actualToId !== segment.from) return [createIssue("max_pages_violation", segment, segmentIndex, {
            actual: {
                from: segment.from,
                to: actualToId
            },
            evidence: `Segment spans pages ${segment.from}-${actualToId} in joined content (maxPages=0).`,
            expected: {
                from: segment.from,
                to: segment.from
            }
        })];
    }
    if (segment.to !== void 0) {
        if (actualToId > segment.to) return [createIssue("max_pages_violation", segment, segmentIndex, {
            actual: {
                from: segment.from,
                to: actualToId
            },
            evidence: `Segment content ends on page ${actualToId} but segment.to is ${segment.to}.`,
            expected: {
                from: segment.from,
                to: segment.to
            }
        })];
    } else if (maxPages !== void 0) {
        const span = actualToId - segment.from;
        if (span > maxPages) return [createIssue("max_pages_violation", segment, segmentIndex, {
            actual: {
                from: segment.from,
                to: actualToId
            },
            evidence: `Segment spans ${span} pages (maxPages=${maxPages}).`,
            expected: {
                from: segment.from,
                to: segment.from + maxPages
            }
        })];
    }
    return [];
};
/**
 * Handles validation when content is not found in the expected boundary window.
 * Fallback strategy: search entire document if segment matches existing content elsewhere.
 */
const handleMissingBoundary = (segment, segmentIndex, joined, boundaries, pageMap) => {
    const matches = findJoinedMatches(segment.content, joined, 0, joined.length, 1);
    if (matches.length === 0) return [createIssue("content_not_found", segment, segmentIndex, { evidence: "Segment content not found in any page content." }, pageMap)];
    const match = matches[0];
    const actualFromId = findBoundaryIdForOffset(match.start, boundaries);
    const actualToId = findBoundaryIdForOffset(match.end, boundaries);
    return [createIssue("page_attribution_mismatch", segment, segmentIndex, {
        actual: {
            from: segment.from,
            to: segment.to
        },
        evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
        expected: {
            from: actualFromId,
            to: actualToId
        },
        matchIndex: match.start
    }, pageMap)];
};
/**
 * Performs a widened search when the direct check fails.
 * Includes a small buffer around the expected position, and optionally a full-document search for short segments.
 */
const handleFallbackSearch = (segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions) => {
    const content = segment.content;
    const bufferSize = 1e3;
    const rawMatches = findJoinedMatches(content, joined, Math.max(0, searchStart - bufferSize), Math.min(joined.length, searchEnd + bufferSize), 5);
    if (rawMatches.length === 0) {
        const threshold = validationOptions?.fullSearchThreshold ?? 500;
        if (content.length < threshold) {
            const fullMatches = findJoinedMatches(content, joined, 0, joined.length, 50);
            const validMatch = fullMatches.find((m) => {
                return findBoundaryIdForOffset(m.start, boundaries) === segment.from;
            });
            if (validMatch) return checkMaxPagesViolation(segment, segmentIndex, maxPages, validMatch.end, expectedBoundary.end, boundaries);
            if (fullMatches.length > 0) {
                const match = fullMatches[0];
                const actualFromId = findBoundaryIdForOffset(match.start, boundaries);
                const actualToId = findBoundaryIdForOffset(match.end, boundaries);
                return [createIssue("page_attribution_mismatch", segment, segmentIndex, {
                    actual: {
                        from: segment.from,
                        to: segment.to
                    },
                    evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
                    expected: {
                        from: actualFromId,
                        to: actualToId
                    },
                    matchIndex: match.start
                }, pageMap)];
            }
        }
        return [createIssue("content_not_found", segment, segmentIndex, {
            evidence: `Segment content (${content.length} chars) not found in expected window.`,
            hint: "Check page boundary attribution in segmenter.ts."
        }, pageMap)];
    }
    const alignedMatches = rawMatches.filter((m) => m.start >= expectedBoundary.start && m.start <= expectedBoundary.end);
    if (alignedMatches.length > 0) {
        const primary = alignedMatches[0];
        return checkMaxPagesViolation(segment, segmentIndex, maxPages, primary.end, expectedBoundary.end, boundaries);
    }
    const primary = rawMatches[0];
    const actualFromId = findBoundaryIdForOffset(primary.start, boundaries);
    const actualToId = findBoundaryIdForOffset(primary.end, boundaries);
    return [createIssue("page_attribution_mismatch", segment, segmentIndex, {
        actual: {
            from: segment.from,
            to: segment.to
        },
        evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
        expected: {
            from: actualFromId,
            to: actualToId
        },
        matchIndex: primary.start
    }, pageMap)];
};
/**
 * Calculates the search range end index based on segment.to or strict bounds.
 */
const getSearchRange = (segment, expectedBoundary, boundaryMap, joinedLength) => {
    let searchEnd = expectedBoundary.end + 1;
    if (segment.to !== void 0) {
        const endBoundary = boundaryMap.get(segment.to);
        if (endBoundary) searchEnd = endBoundary.end + 1;
        else searchEnd = Math.min(joinedLength, expectedBoundary.end + 5e4);
    }
    return searchEnd;
};
/**
 * Validates attribution for a single segment by searching for its content in the joined text.
 * Returns issues if content is missing, mis-attributed, or violates page limits.
 */
const getAttributionIssues = (segment, segmentIndex, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions) => {
    if (!segment.content) return [createIssue("content_not_found", segment, segmentIndex, { evidence: "Segment content is empty." }, pageMap)];
    const expectedBoundary = boundaryMap.get(segment.from);
    if (!expectedBoundary) return handleMissingBoundary(segment, segmentIndex, joined, boundaries, pageMap);
    const searchEnd = getSearchRange(segment, expectedBoundary, boundaryMap, joined.length);
    const searchStart = expectedBoundary.start;
    const idx = joined.indexOf(segment.content, searchStart);
    if (idx !== -1 && idx < searchEnd) return checkMaxPagesViolation(segment, segmentIndex, maxPages, idx + segment.content.length - 1, expectedBoundary.end, boundaries);
    return handleFallbackSearch(segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions);
};
/**
 * Performs purely static checks on the segment metadata (Ids and spans) before expensive content searching.
 */
const checkStaticMaxPages = (segment, index, maxPages) => {
    if (maxPages === void 0 || segment.to === void 0) return null;
    if (maxPages === 0) {
        if (segment.to !== segment.from) return createIssue("max_pages_violation", segment, index, {
            evidence: "maxPages=0 requires all segments to stay within one page.",
            expected: {
                from: segment.from,
                to: segment.from
            },
            hint: "Check boundary detection in breakpoint-utils.ts."
        });
        return null;
    }
    const span = segment.to - segment.from;
    if (span > maxPages) return createIssue("max_pages_violation", segment, index, {
        evidence: `Segment spans ${span} pages (maxPages=${maxPages}).`,
        expected: {
            from: segment.from,
            to: segment.from + maxPages
        },
        hint: "Check breakpoint windowing and page attribution in breakpoint-processor.ts."
    });
    return null;
};
|
|
4555
|
-
-/**
- * Validates a list of segments against the source pages.
- * Checks for:
- * - Page existence (invalid IDs)
- * - Content fidelity (content must exist in pages)
- * - Page attribution (from/to must match content location)
- * - Page constraints (maxPages violations)
- *
- * @param pages Input pages used for segmentation
- * @param options Options used during segmentation (for preprocessing/joining consistency)
- * @param segments The output segments to validate
- * @param validationOptions Optional settings for validation behavior
- * @returns A detailed validation report
- */
-const validateSegments = (pages, options, segments, validationOptions) => {
-  const normalizedPages = normalizePages(pages, options);
-  const { boundaries, joined } = buildJoinedContent(normalizedPages, options.pageJoiner === "newline" ? "\n" : " ");
-  const boundaryMap = /* @__PURE__ */ new Map();
-  const pageMap = /* @__PURE__ */ new Map();
-  for (const b of boundaries) boundaryMap.set(b.id, b);
-  for (const p of normalizedPages) pageMap.set(p.id, p);
-  const pageIds = new Set(normalizedPages.map((p) => p.id));
-  const maxPages = options.maxPages;
-  const issues = [];
-  for (let i = 0; i < segments.length; i++) {
-    const segment = segments[i];
-    if (!pageIds.has(segment.from)) {
-      issues.push(createIssue("page_not_found", segment, i));
-      continue;
-    }
-    if (segment.to !== void 0 && !pageIds.has(segment.to)) issues.push(createIssue("page_not_found", segment, i, { evidence: `Segment.to=${segment.to} does not exist in input pages.` }));
-    const staticMaxPageIssue = checkStaticMaxPages(segment, i, maxPages);
-    if (staticMaxPageIssue) issues.push(staticMaxPageIssue);
-    const attributionIssues = getAttributionIssues(segment, i, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions);
-    issues.push(...attributionIssues);
-  }
-  const errors = issues.filter((issue) => issue.severity === "error").length;
-  const warnings = issues.filter((issue) => issue.severity === "warn").length;
-  return {
-    issues,
-    ok: issues.length === 0,
-    summary: {
-      errors,
-      issues: issues.length,
-      pageCount: pages.length,
-      segmentCount: segments.length,
-      warnings
-    }
-  };
-};
-//#endregion
-export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
+export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, DictionaryProfileValidationError, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeDictionaryMarkdownPages, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, classifyDictionaryHeading, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, diagnoseDictionaryProfile, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, scanDictionaryMarkdownPage, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, suggestSegmentationOptions, templateToRegex, validateDictionaryProfile, validateRules, validateSegments, withCapture };
 
 //# sourceMappingURL=index.mjs.map
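Note: although the implementation above was removed from `dist/index.mjs`, `validateSegments` and `formatValidationReport` remain in both export lists, so the code has moved (apparently into the new `segmentation-advisor` chunk) rather than been dropped. A minimal usage sketch, reconstructed only from what is visible in this diff — the option names `pageJoiner`, `maxPages`, and `fullSearchThreshold`, the `from`/`to`/`content` segment fields, the page `id` field, and the `ok`/`summary` report shape; the exact page object shape is an assumption, not a confirmed API:

```js
import { formatValidationReport, validateSegments } from "flappa-doormal";

// Page and segment shapes inferred from the removed validator:
// pages carry an `id`, segments carry `content` plus `from`/`to` page ids.
const pages = [
  { content: "باب الطهارة وما جاء فيها", id: 1 },
  { content: "باب الصلاة وما جاء فيها", id: 2 },
];
const segments = [
  { content: "باب الطهارة وما جاء فيها", from: 1, to: 1 },
  { content: "باب الصلاة وما جاء فيها", from: 2 },
];

// `pageJoiner` and `maxPages` are the option names read by the removed code;
// `fullSearchThreshold` (default 500) gates the full-document fallback search
// for short segments whose windowed search fails.
const options = { maxPages: 1, pageJoiner: "newline" };
const report = validateSegments(pages, options, segments, { fullSearchThreshold: 500 });

if (!report.ok) console.error(formatValidationReport(report));
console.log(report.summary); // { errors, issues, pageCount, segmentCount, warnings }
```

Per the removed logic, a clean run returns `ok: true`; mis-attributed or missing content surfaces as `page_attribution_mismatch`, `content_not_found`, `page_not_found`, or `max_pages_violation` issues.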