flappa-doormal 2.8.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +40 -5
- package/README.md +91 -47
- package/dist/index.d.mts +17 -38
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +107 -21
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/AGENTS.md
CHANGED
|
@@ -26,8 +26,14 @@ Traditional Arabic text segmentation requires:
|
|
|
26
26
|
```text
|
|
27
27
|
src/
|
|
28
28
|
├── index.ts # Main entry point and exports
|
|
29
|
-
├──
|
|
30
|
-
├──
|
|
29
|
+
├── analysis/ # Analysis helpers module
|
|
30
|
+
│ ├── index.ts # Barrel exports for analysis functions
|
|
31
|
+
│ ├── shared.ts # Shared utilities for analysis
|
|
32
|
+
│ ├── line-starts.ts # analyzeCommonLineStarts (line-based patterns)
|
|
33
|
+
│ ├── repeating-sequences.ts # analyzeRepeatingSequences (continuous text N-grams)
|
|
34
|
+
│ └── *.test.ts # Analysis tests
|
|
35
|
+
├── pattern-detection.ts # Token detection for auto-generating rules
|
|
36
|
+
├── pattern-detection.test.ts # Pattern detection tests
|
|
31
37
|
├── recovery.ts # Marker recovery utility (recover mistaken lineStartsAfter)
|
|
32
38
|
├── recovery.test.ts # Marker recovery tests
|
|
33
39
|
└── segmentation/
|
|
@@ -202,7 +208,7 @@ Raw `regex` patterns now support named capture groups for metadata extraction:
|
|
|
202
208
|
|
|
203
209
|
### Breakpoints Post-Processing Algorithm
|
|
204
210
|
|
|
205
|
-
The `breakpoints` option provides a post-processing mechanism for limiting segment size.
|
|
211
|
+
The `breakpoints` option provides a post-processing mechanism for limiting segment size. Breakpoint processing runs AFTER all structural rules have been applied.
|
|
206
212
|
|
|
207
213
|
**API Options:**
|
|
208
214
|
```typescript
|
|
@@ -244,7 +250,7 @@ segmentPages(pages, {
|
|
|
244
250
|
- **`prefer: 'shorter'`**: Finds FIRST match (conservative)
|
|
245
251
|
- **Recursive**: If split result still exceeds `maxPages`, breakpoints runs again
|
|
246
252
|
|
|
247
|
-
> **Note**:
|
|
253
|
+
> **Note**: Older per-rule span limiting approaches were removed in favor of post-processing `breakpoints`.
|
|
248
254
|
|
|
249
255
|
## Design Decisions
|
|
250
256
|
|
|
@@ -357,7 +363,7 @@ bunx biome lint .
|
|
|
357
363
|
|
|
358
364
|
5. **Rule order matters for specificity**: When multiple rules can match the same position, put specific patterns BEFORE generic ones. Example: `## {{raqms:num}} {{dash}}` must come before `##` to capture the number.
|
|
359
365
|
|
|
360
|
-
6. **Post-processing beats per-rule limits**:
|
|
366
|
+
6. **Post-processing beats per-rule limits**: Per-rule span limiting caused premature splits. Moving to post-processing `breakpoints` preserves structural integrity while still limiting segment size.
|
|
361
367
|
|
|
362
368
|
7. **Window padding matters**: When calculating approximate content windows, 50% padding is needed (not 20%) to ensure enough content is captured for `prefer: 'longer'` scenarios.
|
|
363
369
|
|
|
@@ -494,3 +500,32 @@ const quoted = analyzeCommonLineStarts(pages, {
|
|
|
494
500
|
});
|
|
495
501
|
```
|
|
496
502
|
|
|
503
|
+
## Repeating Sequence Analysis (`analyzeRepeatingSequences`)
|
|
504
|
+
|
|
505
|
+
For continuous text **without line breaks** (prose-like content), use `analyzeRepeatingSequences(pages)`. It scans for commonly repeating word/token sequences (N-grams) across pages.
|
|
506
|
+
|
|
507
|
+
Key options:
|
|
508
|
+
- `minElements` / `maxElements`: N-gram size range (default 1-3)
|
|
509
|
+
- `minCount`: Minimum occurrences to include (default 3)
|
|
510
|
+
- `topK`: Maximum patterns to return (default 20)
|
|
511
|
+
- `requireToken`: Only include patterns that contain `{{tokens}}` (default true)
|
|
512
|
+
- `normalizeArabicDiacritics`: Ignore diacritics when matching (default true)
|
|
513
|
+
|
|
514
|
+
Example:
|
|
515
|
+
```typescript
|
|
516
|
+
import { analyzeRepeatingSequences } from 'flappa-doormal';
|
|
517
|
+
|
|
518
|
+
const patterns = analyzeRepeatingSequences(pages, { minCount: 3, topK: 20 });
|
|
519
|
+
// [{ pattern: '{{naql}}', count: 42, examples: [...] }, ...]
|
|
520
|
+
```
|
|
521
|
+
|
|
522
|
+
## Analysis → Segmentation Workflow
|
|
523
|
+
|
|
524
|
+
Use the analysis functions to discover patterns, build rules from them, then pass those rules to `segmentPages()`:
|
|
525
|
+
|
|
526
|
+
1. **Continuous text**: `analyzeRepeatingSequences()` → build rules → `segmentPages()`
|
|
527
|
+
2. **Structured text**: `analyzeCommonLineStarts()` → build rules → `segmentPages()`
|
|
528
|
+
|
|
529
|
+
See README.md for complete examples.
|
|
530
|
+
|
|
531
|
+
|
package/README.md
CHANGED
|
@@ -228,7 +228,6 @@ Control which matches to use:
|
|
|
228
228
|
lineEndsWith: ['\\.'],
|
|
229
229
|
split: 'after',
|
|
230
230
|
occurrence: 'last', // Only split at LAST period on page
|
|
231
|
-
maxSpan: 1, // Apply per-page
|
|
232
231
|
}
|
|
233
232
|
```
|
|
234
233
|
|
|
@@ -406,8 +405,99 @@ Key options:
|
|
|
406
405
|
- If you paste these signatures into `lineStartsWith` / `lineStartsAfter` / `template`, that’s fine: those template pattern types **auto-escape `()[]`** outside `{{tokens}}`.
|
|
407
406
|
- If you paste them into a raw `regex` rule, you may need to escape literal brackets yourself.
|
|
408
407
|
|
|
408
|
+
### Repeating Sequence Analysis (continuous text)
|
|
409
|
+
|
|
410
|
+
For texts without line breaks (continuous prose), use `analyzeRepeatingSequences()`:
|
|
411
|
+
|
|
412
|
+
```typescript
|
|
413
|
+
import { analyzeRepeatingSequences } from 'flappa-doormal';
|
|
414
|
+
|
|
415
|
+
const patterns = analyzeRepeatingSequences(pages, {
|
|
416
|
+
minElements: 2,
|
|
417
|
+
maxElements: 4,
|
|
418
|
+
minCount: 3,
|
|
419
|
+
topK: 20,
|
|
420
|
+
});
|
|
421
|
+
// [{ pattern: "{{naql}}\\s*{{harf}}", count: 42, examples: [...] }, ...]
|
|
422
|
+
```
|
|
423
|
+
|
|
424
|
+
Key options:
|
|
425
|
+
- `minElements` / `maxElements`: N-gram size range (default 1-3)
|
|
426
|
+
- `minCount`: Minimum occurrences to include (default 3)
|
|
427
|
+
- `topK`: Maximum patterns to return (default 20)
|
|
428
|
+
- `requireToken`: Only include patterns that contain `{{tokens}}` (default true)
|
|
429
|
+
- `normalizeArabicDiacritics`: Ignore diacritics when matching (default true)
|
|
430
|
+
|
|
431
|
+
## Analysis → Segmentation Workflow
|
|
432
|
+
|
|
433
|
+
Use the analysis functions to discover patterns, build rules from them, then pass those rules to `segmentPages()`.
|
|
434
|
+
|
|
435
|
+
### Example A: Continuous Text (No Punctuation)
|
|
436
|
+
|
|
437
|
+
For prose-like text without structural line breaks:
|
|
438
|
+
|
|
439
|
+
```typescript
|
|
440
|
+
import { analyzeRepeatingSequences, segmentPages, type Page } from 'flappa-doormal';
|
|
441
|
+
|
|
442
|
+
// Continuous Arabic text with narrator phrases
|
|
443
|
+
const pages: Page[] = [
|
|
444
|
+
{ id: 1, content: 'حدثنا أحمد بن محمد عن عمر قال سمعت النبي حدثنا خالد بن زيد عن علي' },
|
|
445
|
+
{ id: 2, content: 'حدثنا سعيد بن جبير عن ابن عباس أخبرنا يوسف عن أنس' },
|
|
446
|
+
];
|
|
447
|
+
|
|
448
|
+
// Step 1: Discover repeating patterns
|
|
449
|
+
const patterns = analyzeRepeatingSequences(pages, { minCount: 2, topK: 10 });
|
|
450
|
+
// [{ pattern: '{{naql}}', count: 5, examples: [...] }, ...]
|
|
451
|
+
|
|
452
|
+
// Step 2: Build rules from discovered patterns
|
|
453
|
+
const rules = patterns.filter(p => p.count >= 3).map(p => ({
|
|
454
|
+
lineStartsWith: [p.pattern],
|
|
455
|
+
split: 'at' as const,
|
|
456
|
+
fuzzy: true,
|
|
457
|
+
}));
|
|
458
|
+
|
|
459
|
+
// Step 3: Segment
|
|
460
|
+
const segments = segmentPages(pages, { rules });
|
|
461
|
+
// [{ content: 'حدثنا أحمد بن محمد عن عمر قال سمعت النبي', from: 1 }, ...]
|
|
462
|
+
```
|
|
463
|
+
|
|
464
|
+
### Example B: Structured Text (With Numbering)
|
|
465
|
+
|
|
466
|
+
For hadith-style numbered entries:
|
|
467
|
+
|
|
468
|
+
```typescript
|
|
469
|
+
import { analyzeCommonLineStarts, segmentPages, type Page } from 'flappa-doormal';
|
|
470
|
+
|
|
471
|
+
// Numbered hadith text
|
|
472
|
+
const pages: Page[] = [
|
|
473
|
+
{ id: 1, content: '٦٦٩٦ - حَدَّثَنَا أَبُو بَكْرٍ عَنِ النَّبِيِّ\n٦٦٩٧ - أَخْبَرَنَا عُمَرُ قَالَ' },
|
|
474
|
+
{ id: 2, content: '٦٦٩٨ - حَدَّثَنِي مُحَمَّدٌ عَنْ عَائِشَةَ' },
|
|
475
|
+
];
|
|
476
|
+
|
|
477
|
+
// Step 1: Discover common line-start patterns
|
|
478
|
+
const patterns = analyzeCommonLineStarts(pages, { topK: 10, minCount: 2 });
|
|
479
|
+
// [{ pattern: '{{raqms}}\\s*{{dash}}', count: 3, examples: [...] }, ...]
|
|
480
|
+
|
|
481
|
+
// Step 2: Build rules (add named capture for hadith number)
|
|
482
|
+
const topPattern = patterns[0]?.pattern ?? '{{raqms}} {{dash}} ';
|
|
483
|
+
const rules = [{
|
|
484
|
+
lineStartsAfter: [topPattern.replace('{{raqms}}', '{{raqms:num}}')],
|
|
485
|
+
split: 'at' as const,
|
|
486
|
+
meta: { type: 'hadith' }
|
|
487
|
+
}];
|
|
488
|
+
|
|
489
|
+
// Step 3: Segment
|
|
490
|
+
const segments = segmentPages(pages, { rules });
|
|
491
|
+
// [
|
|
492
|
+
// { content: 'حَدَّثَنَا أَبُو بَكْرٍ...', from: 1, meta: { type: 'hadith', num: '٦٦٩٦' } },
|
|
493
|
+
// { content: 'أَخْبَرَنَا عُمَرُ قَالَ', from: 1, meta: { type: 'hadith', num: '٦٦٩٧' } },
|
|
494
|
+
// { content: 'حَدَّثَنِي مُحَمَّدٌ...', from: 2, meta: { type: 'hadith', num: '٦٦٩٨' } },
|
|
495
|
+
// ]
|
|
496
|
+
```
|
|
497
|
+
|
|
409
498
|
## Rule Validation
|
|
410
499
|
|
|
500
|
+
|
|
411
501
|
Use `validateRules()` to detect common mistakes in rule patterns before running segmentation:
|
|
412
502
|
|
|
413
503
|
```typescript
|
|
@@ -619,32 +709,10 @@ const segments = segmentPages(pages, {
|
|
|
619
709
|
lineEndsWith: ['\\.'],
|
|
620
710
|
split: 'after',
|
|
621
711
|
occurrence: 'last',
|
|
622
|
-
maxSpan: 1
|
|
623
|
-
}]
|
|
624
|
-
});
|
|
625
|
-
```
|
|
626
|
-
|
|
627
|
-
### Page Fallback for Unmatched Content
|
|
628
|
-
|
|
629
|
-
When using `maxSpan` to group matches per page, use `fallback: 'page'` to prevent unmatched pages from merging with adjacent segments:
|
|
630
|
-
|
|
631
|
-
```typescript
|
|
632
|
-
const segments = segmentPages(pages, {
|
|
633
|
-
rules: [{
|
|
634
|
-
template: '{{tarqim}}', // Match punctuation marks
|
|
635
|
-
split: 'after',
|
|
636
|
-
occurrence: 'last',
|
|
637
|
-
maxSpan: 1,
|
|
638
|
-
fallback: 'page' // If no punctuation found, segment the page anyway
|
|
639
712
|
}]
|
|
640
713
|
});
|
|
641
714
|
```
|
|
642
715
|
|
|
643
|
-
**Without `fallback`**: Pages without matches merge into the next segment
|
|
644
|
-
**With `fallback: 'page'`**: Each page becomes its own segment even without matches
|
|
645
|
-
|
|
646
|
-
> **Future extensions**: The `fallback` option may support additional values like `'skip'` (omit unmatched content) or `'line'` (split at line breaks) in future versions.
|
|
647
|
-
|
|
648
716
|
### Multiple Rules with Priority
|
|
649
717
|
|
|
650
718
|
```typescript
|
|
@@ -912,9 +980,7 @@ type SplitRule = {
|
|
|
912
980
|
// Split behavior
|
|
913
981
|
split?: 'at' | 'after'; // Default: 'at'
|
|
914
982
|
occurrence?: 'first' | 'last' | 'all';
|
|
915
|
-
maxSpan?: number;
|
|
916
983
|
fuzzy?: boolean;
|
|
917
|
-
fallback?: 'page'; // NEW: Page-boundary fallback
|
|
918
984
|
|
|
919
985
|
// Constraints
|
|
920
986
|
min?: number;
|
|
@@ -1042,28 +1108,6 @@ The library concatenates all pages into a single string for pattern matching acr
|
|
|
1042
1108
|
|
|
1043
1109
|
For typical book processing (up to 6,000 pages), memory usage is well within Node.js defaults. For very large books (40,000+ pages), ensure adequate heap size.
|
|
1044
1110
|
|
|
1045
|
-
### `maxSpan` Sliding Window Behavior
|
|
1046
|
-
|
|
1047
|
-
The `maxSpan` option uses a **sliding window algorithm** based on page ID difference:
|
|
1048
|
-
|
|
1049
|
-
```typescript
|
|
1050
|
-
// maxSpan = maximum page ID difference when looking ahead for split points
|
|
1051
|
-
// Algorithm prefers LONGER segments by looking as far ahead as allowed
|
|
1052
|
-
|
|
1053
|
-
// Pages [1, 2, 3, 4] with maxSpan: 1, occurrence: 'last'
|
|
1054
|
-
// Window from page 1: pages 1-2 (diff <= 1), splits at page 2's last match
|
|
1055
|
-
// Window from page 3: pages 3-4 (diff <= 1), splits at page 4's last match
|
|
1056
|
-
// Result: 2 segments spanning pages 1-2 and 3-4
|
|
1057
|
-
|
|
1058
|
-
// Pages [1, 5, 10] with maxSpan: 1, occurrence: 'last'
|
|
1059
|
-
// Window from page 1: only page 1 (5-1=4 > 1), splits at page 1
|
|
1060
|
-
// Window from page 5: only page 5 (10-5=5 > 1), splits at page 5
|
|
1061
|
-
// Window from page 10: only page 10, splits at page 10
|
|
1062
|
-
// Result: 3 segments (pages too far apart to merge)
|
|
1063
|
-
```
|
|
1064
|
-
|
|
1065
|
-
This is intentional for books where page IDs represent actual page numbers. With `occurrence: 'last'`, the algorithm finds the last match within the lookahead window, creating longer segments where possible.
|
|
1066
|
-
|
|
1067
1111
|
## For AI Agents
|
|
1068
1112
|
|
|
1069
1113
|
See [AGENTS.md](./AGENTS.md) for:
|
package/dist/index.d.mts
CHANGED
|
@@ -238,38 +238,9 @@ type SplitBehavior = {
|
|
|
238
238
|
* - `'first'`: Only split at the first match
|
|
239
239
|
* - `'last'`: Only split at the last match
|
|
240
240
|
*
|
|
241
|
-
* When `maxSpan` is set, occurrence filtering is applied per sliding
|
|
242
|
-
* window rather than globally. With `'last'`, the algorithm prefers
|
|
243
|
-
* longer segments by looking as far ahead as allowed before selecting
|
|
244
|
-
* the last match in the window.
|
|
245
|
-
*
|
|
246
241
|
* @default 'all'
|
|
247
242
|
*/
|
|
248
243
|
occurrence?: 'first' | 'last' | 'all';
|
|
249
|
-
/**
|
|
250
|
-
* Maximum page ID difference allowed when looking ahead for split points.
|
|
251
|
-
*
|
|
252
|
-
* Uses a sliding window algorithm that prefers longer segments:
|
|
253
|
-
* 1. Start from the first page of the current segment
|
|
254
|
-
* 2. Look for matches within pages where `pageId - startPageId <= maxSpan`
|
|
255
|
-
* 3. Apply occurrence filter (e.g., 'last') to select a match
|
|
256
|
-
* 4. Next window starts from the page after the match
|
|
257
|
-
*
|
|
258
|
-
* Examples:
|
|
259
|
-
* - `maxSpan: 1` = look 1 page ahead (segments span at most 2 pages)
|
|
260
|
-
* - `maxSpan: 2` = look 2 pages ahead (segments span at most 3 pages)
|
|
261
|
-
* - `undefined` = no limit (entire content treated as one group)
|
|
262
|
-
*
|
|
263
|
-
* Note: With non-consecutive page IDs, the algorithm uses actual ID
|
|
264
|
-
* difference, not array index. Pages 1 and 5 have a difference of 4.
|
|
265
|
-
*
|
|
266
|
-
* @example
|
|
267
|
-
* // Split at last period, looking up to 1 page ahead
|
|
268
|
-
* // Pages 1,2: split at page 2's last period
|
|
269
|
-
* // Page 3: split at page 3's last period
|
|
270
|
-
* { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }
|
|
271
|
-
*/
|
|
272
|
-
maxSpan?: number;
|
|
273
244
|
/**
|
|
274
245
|
* Enable diacritic-insensitive matching for Arabic text.
|
|
275
246
|
*
|
|
@@ -354,12 +325,6 @@ type RuleConstraints = {
|
|
|
354
325
|
* { lineStartsWith: ['{{bab}}'], split: 'before', meta: { type: 'chapter' } }
|
|
355
326
|
*/
|
|
356
327
|
meta?: Record<string, unknown>;
|
|
357
|
-
/**
|
|
358
|
-
* Fallback behavior when no matches are found within a maxSpan boundary.
|
|
359
|
-
* - 'page': Create split points at page boundaries
|
|
360
|
-
* - undefined: No fallback (current behavior)
|
|
361
|
-
*/
|
|
362
|
-
fallback?: 'page';
|
|
363
328
|
/**
|
|
364
329
|
* Page-start guard: only allow this rule to match at the START of a page if the
|
|
365
330
|
* previous page's last non-whitespace character matches this pattern.
|
|
@@ -388,7 +353,7 @@ type RuleConstraints = {
|
|
|
388
353
|
* Each rule must specify:
|
|
389
354
|
* - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
|
|
390
355
|
* `lineStartsAfter`, or `lineEndsWith`
|
|
391
|
-
* - **Split behavior**: `split` (optional, defaults to `'at'`), `occurrence`, `
|
|
356
|
+
* - **Split behavior**: `split` (optional, defaults to `'at'`), `occurrence`, `fuzzy`
|
|
392
357
|
* - **Constraints** (optional): `min`, `max`, `meta`
|
|
393
358
|
*
|
|
394
359
|
* @example
|
|
@@ -424,7 +389,6 @@ type SplitRule = PatternType & SplitBehavior & RuleConstraints;
|
|
|
424
389
|
type Page = {
|
|
425
390
|
/**
|
|
426
391
|
* Unique page/entry ID used for:
|
|
427
|
-
* - `maxSpan` grouping (segments spanning multiple pages)
|
|
428
392
|
* - `min`/`max` constraint filtering
|
|
429
393
|
* - `from`/`to` tracking in output segments
|
|
430
394
|
*/
|
|
@@ -625,6 +589,21 @@ type SegmentationOptions = {
|
|
|
625
589
|
* rule's metadata is used for each segment.
|
|
626
590
|
*/
|
|
627
591
|
rules?: SplitRule[];
|
|
592
|
+
/**
|
|
593
|
+
* Attach debugging provenance into `segment.meta` indicating which rule and/or breakpoint
|
|
594
|
+
* created the segment boundary.
|
|
595
|
+
*
|
|
596
|
+
* This is opt-in because it increases output size.
|
|
597
|
+
*
|
|
598
|
+
* When enabled (default metaKey: `_flappa`), segments may include:
|
|
599
|
+
* `meta._flappa.rule` and/or `meta._flappa.breakpoint`.
|
|
600
|
+
*/
|
|
601
|
+
debug?: boolean | {
|
|
602
|
+
/** Where to store provenance in meta. @default '_flappa' */
|
|
603
|
+
metaKey?: string;
|
|
604
|
+
/** Which kinds of provenance to include. @default ['rule','breakpoint'] */
|
|
605
|
+
include?: Array<'rule' | 'breakpoint'>;
|
|
606
|
+
};
|
|
628
607
|
/**
|
|
629
608
|
* Maximum pages per segment before breakpoints are applied.
|
|
630
609
|
*
|
|
@@ -770,7 +749,7 @@ type Segment = {
|
|
|
770
749
|
/**
|
|
771
750
|
* Types of validation issues that can be detected.
|
|
772
751
|
*/
|
|
773
|
-
type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate';
|
|
752
|
+
type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern';
|
|
774
753
|
/**
|
|
775
754
|
* A validation issue found in a pattern.
|
|
776
755
|
*/
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/pattern-validator.ts","../src/segmentation/replace.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/analysis/line-starts.ts","../src/analysis/repeating-sequences.ts","../src/detection.ts","../src/recovery.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/pattern-validator.ts","../src/segmentation/replace.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/analysis/line-starts.ts","../src/analysis/repeating-sequences.ts","../src/detection.ts","../src/recovery.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA6DzB;AAAkD;AAyHlD;;;;;AAkBA;AAoCA;AA0EY,cDzZC,WCyZqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAuBA;AA+CA;;;;;;;AAwJA;;;;AC1sBA;AAKA;AAUA;;;;;;AA2GA;;;;AC3HA;AA2DA;;;;;;;;AC+SA;AAAoC,cJrNvB,wBIqNuB,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AJpTpC;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA6DzB;AAAkD;AAyHlD;;;;;AAkBA,KA/UK,YAAA,GA+UW;EAoCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAuBA;AA+CA;;;;;;;AAwJA;;;;AC1sBA;AAKA;AAUA;;;;;;AA2GA;;;KD7EK,eAAA;EE9CO;EA2DC,QAAA,EAAA,MAAA;CAA4B;;;;;;;AC+SzC;;;;;;;;AClTA;AA0QA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;KJlnBK,qBAAA;;EKnEO,cAAA,EAAA,MAAA,EAAA;AAcZ,CAAA;AAEA;AAwQA;;;;;;;;AClRA;AAaA;AAOA;AA2OA;;;;;;;;AC9QA;AA+EA;AAgEA;AAuBA;AAiCA;;;;AC7MA,KRmHK,sBAAA,GQnH6B;EAKtB;EACC,eAAA,EAAA,MAAA,EAAA;CACF;;;;AAKX;AA2BE;AAmnBF;;;;;;;;;AAsDA;;;;;KRrkBK,mBAAA,GQwkB+C;;;;;;;;;;;;;;KRzjB/C,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAuDC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAoCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;KAuBL,WAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+CA,mBAAA;;;;;;YAME;;;;;;;;UASF;;;;;;;;;;;;;;cAiBY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA+CN;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;
;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;AAttBM;AA4BG;AA8BM;AAyDrB,KCpIO,mBAAA,GDoIY,gBAAA,GAAA,eAAA,GAAA,WAAA,GAAA,eAAA;AAAA;;;AAkBlB,KCjJM,eAAA,GDiJN;EACA,IAAA,ECjJI,mBDiJJ;EACA,OAAA,EAAA,MAAA;EAAmB,UAAA,CAAA,EAAA,MAAA;AAAA,CAAA;AA6DzB;AAAkD;AAyHlD;;AAAsC,KC/T1B,oBAAA,GD+T0B;EAAgB,cAAA,CAAA,EAAA,CC9ThC,eD8TgC,GAAA,SAAA,CAAA,EAAA;EAAe,eAAA,CAAA,EAAA,CC7T9C,eD6T8C,GAAA,SAAA,CAAA,EAAA;EAkBzD,YAAI,CAAA,EAAA,CC9UI,eD8UJ,GAAA,SAAA,CAAA,EAAA;EAoCJ,QAAA,CAAA,ECjXG,eDiXW;AA0E1B,CAAA;AA8BA;AAuBA;AA+CA;;;;;;;AAwJA;;;;AC1sBA;AAKA;AAUA;;;;AAIe,cAuGF,aAvGE,EAAA,CAAA,KAAA,EAuGsB,SAvGtB,EAAA,EAAA,GAAA,CAuGqC,oBAvGrC,GAAA,SAAA,CAAA,EAAA;;;AFkCf;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAuCtB,KEpJO,WAAA,GAAc,WFoJV,CEpJsB,mBFoJtB,CAAA,SAAA,CAAA,CAAA,CAAA,MAAA,CAAA;;;;;;;AAKS;AA6DzB;AAAkD;AAyHlD;AAAwB,cEpRX,iBFoRW,EAAA,CAAA,KAAA,EEpRiB,IFoRjB,EAAA,EAAA,KAAA,CAAA,EEpRiC,WFoRjC,EAAA,EAAA,GEpRiD,IFoRjD,EAAA;;;;;AAkBxB;AAoCA;AA0EA;AA8BA;AAuBA;AA+CA;;;;;;;AAwJA;;;;AC1sBA;AAKA;AAUA;;;;;;AA2GA;;;;AC3HA;AA2DA;;;;;;;;AC+SA;;;;;;;;AClTA;AA0QA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBa,cDpVA,YCoVsF,EAAA,CAAA,KAAA,EDpV/D,ICoV+D,EAAA,EAAA,OAAA,EDpV9C,mBCoV8C,EAAA,GDpV3B,OCoV2B,EAAA;;;;ALxoBnG;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA6DzB;AAAkD;AAyHlD;;;;;AAkBA;AAoCA;AA0EA;AA8BA;AAuBA;AA+CA;;;;;;;AAwJA;;;;AC1sBA;AAKA;AAUA;;;;;;AA2GA;;;;AC3HA;AA2DA;;;;;;;;AC+Sa,cClTA,sBDmXZ,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;;;;;ACnXD;AA0QA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBa,cAvVA,+BAuVmC,EAAA,CAAA,QAAA,EAAA,MAAA,EAAA,GAAA,MAAA;AAqBhD;AAgBA;;;;ACrrBA;AAcA;AAEA;AAwQA;;;;;;;;AClRA;AAaA;AAOA;AA2OA;;;;;;cF0Ga,gBAAgB;;AGxX7B;AA+EA;AAgEA;AAuBA;AAiCA;;;;AC7MA;AAKA;;;;;AAIoC,cJgavB,cIhauB,EAAA,CAAA,KAAA,EAAA,MAAA,EAAA,GAAA,OAAA;AAGpC;AA2BE;AAmnBF;;;;AAIc,KJ1OF,YAAA,GI0OE;EAGa;;;;AA+C3B;EACU,OAAA,EAAA,MAAA;EACsE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cJvHnE,mHAIV;;;;;;;;;;;;;;;;;;;;cAyCU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;ALxoBA,KM7CD,wBAA
A,GN6C8E;EA+F7E,IAAA,CAAA,EAAA,MAAA;;;;ECnIR,WAAA,CAAA,EAAA,MAAY;EA4BZ,wBAAe,CAAA,EAAA,OAAA;EA8Bf,yBAAqB,CAAA,EAAA,OAAA;EAiCrB,MAAA,CAAA,EAAA,aAAA,GAAsB,OAAA;EAwBtB,UAAA,CAAA,EAAA,CAAA,IAAA,EAAA,MAAmB,EAAA,MAAA,EAAA,MAAA,EAAA,GAAA,OAAA;EAenB,cAAW,CAAA,EKjIK,MLiIL,EAAA;EACV,UAAA,CAAA,EAAA,OAAA,GAAA,OAAA;CACA;AACA,KKhIM,uBAAA,GLgIN;EACA,IAAA,EAAA,MAAA;EACA,MAAA,EAAA,MAAA;CAAmB;AAYpB,KK5IO,sBAAA,GL4IM;EAiDN,OAAA,EAAA,MAAS;EAYhB,KAAA,EAAA,MAAA;EA6GO,QAAA,EKnTE,uBLmTO,EAAA;CAAG;;;;AAkBZ,cKhEC,uBLgEG,EAAA,CAAA,KAAA,EK/DL,IL+DK,EAAA,EAAA,OAAA,CAAA,EK9DH,wBL8DG,EAAA,GK7Db,sBL6Da,EAAA;;;AA5NX,KMtHO,wBAAA,GNsHY;EAenB,WAAA,CAAA,EAAW,MAAA;EACV,WAAA,CAAA,EAAA,MAAA;EACA,QAAA,CAAA,EAAA,MAAA;EACA,IAAA,CAAA,EAAA,MAAA;EACA,yBAAA,CAAA,EAAA,OAAA;EACA,YAAA,CAAA,EAAA,OAAA;EAAmB,UAAA,CAAA,EAAA,OAAA,GAAA,OAAA;EAYpB,WAAA,CAAA,EAAA,MAAa;EAiDN,YAAS,CAAA,EAAA,MAAA;EAYhB,iBAAA,CAAe,EAAA,MAAA;AA6GpB,CAAA;AAAwB,KMnTZ,wBAAA,GNmTY;EAAc,IAAA,EAAA,MAAA;EAAgB,OAAA,EAAA,MAAA;EAAe,MAAA,EAAA,MAAA;EAkBzD,YAAI,EAAA,MAAA,EAAA;AAoChB,CAAA;AA0EY,KM5aA,wBAAA,GN4asB;EA8BjB,OAAA,EAAM,MAAA;EAuBX,KAAA,EAAA,MAAA;EA+CA,QAAA,EM7gBE,wBN6gBiB,EAAA;CAMjB;;;;AAkJd;;;cM7ba,mCACF,kBACG,6BACX;;;;AP3NH;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KO7GO,eAAA,GP6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA6DzB;AAAkD;AAyHlD;;;;;AAkBA;AAoCA;AA0EA;AA8BA;AAuBA;AA+CA;;;AAgCwB,cOpgBX,mBPogBW,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GOpgBwB,ePogBxB,EAAA;;;;AAwHxB;;;;AC1sBA;AAKA;AAUA;;;;AAIe,cM2HF,wBN3HE,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EM2HkD,eN3HlD,EAAA,EAAA,GAAA,MAAA;;AAuGf;;;;AC3HA;AA2Da,cK2GA,oBLrFZ,EAAA,CAAA,QAAA,EKsFa,eLtFb,EAAA,EAAA,GAAA;EAtBwC,WAAA,EAAA,gBAAA,GAAA,iBAAA;EAAgB,KAAA,EAAA,OAAA;EAAgB,QAAA,CAAA,EAAA,MAAA;CAAI;;;;AC+S7E;;;AAAwE,cInK3D,kBJmK2D,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA;EAAA,QAAA,EAAA,MAAA;;;;EClT3D,QAAA,EGsJC,eH7Ib,EAAA;AAiQD,CAAA,GAAa,IAAA;;;AL5QA,KS5DD,sBAAA,GT4D8E;EA+F7E,IAAA,EAAA,cAAA;;;;ECnIR,KAAA,CAAA,EAAA,OAAY,GAAA,YAAA;EA4BZ,QAAA,EAAA,MAAA,EAAe;AAAA,C
AAA,GA8Bf;EAiCA,IAAA,EAAA,WAAA;EAwBA,SAAA,EAAA,CAAA,IAAA,EQxIwC,SRwIrB,EAAA,KAAA,EAAA,MAAA,EAAA,GAAA,OAAA;AAAA,CAAA;AAgBlB,KQtJM,iBAAA,GRsJN;EACA,OAAA,EQtJO,mBRsJP;EACA,KAAA,EQtJK,IRsJL,EAAA;EACA,QAAA,EQtJQ,ORsJR,EAAA;EACA,QAAA,EQtJQ,sBRsJR;CAAmB;AAYpB,KQ/JO,oBAAA,GR+JM;EAiDN,OAAA,EAAA;IAYP,IAAA,EAAA,YAAe,GAAA,wBAsDH;IAuDL,SAAS,EAAA,MAAA;IAAG,aAAA,EAAA,MAAA;IAAc,SAAA,EAAA,MAAA;IAAgB,UAAA,EAAA,MAAA;EAAe,CAAA;EAkBzD,KAAA,CAAA,EQnVA,KRmVI,CAAA;IAoCJ,SAAA,EAAA,MAAc;IA0Ed,QAAA,EAAU,MAAA;IA8BL,aAAM,EAAA,MAAA;IAuBX,UAAW,EAAA,MAAA;EA+CX,CAAA,CAAA;EAME,OAAA,EQriBD,KRqiBC,CAAA;IASF,IAAA,EAAA,MAAA;IAiBY,oBAAA,EAAA,MAAA;IA+CN,sBAAA,CAAA,EAAA,MAAA;IAwDL,qBAAA,CAAA,EAAA,MAAA;IAAM,YAAA,EAAA,MAAA;IAiBP,MAAO,EAAA,WA6BR,GAAA,oBAAM,GAAA,WAAA,GAAA,sBAAA,GAAA,qBAAA;;;;ECvuBL,CAAA,CAAA;EAKA,MAAA,EAAA,MAAA,EAAA;EAUA,QAAA,EAAA,MAAA,EAAA;CACU;KOkBjB,oBAAA,GPjBkB,MAAA,GAAA,YAAA,GAAA,qBAAA;AACH,iBOioBJ,qCAAA,CPjoBI,KAAA,EOkoBT,IPloBS,EAAA,EAAA,QAAA,EOmoBN,OPnoBM,EAAA,EAAA,OAAA,EOooBP,mBPpoBO,EAAA,QAAA,EOqoBN,sBProBM,EAAA,IAwGpB,CAxGoB,EAAA;EACL,IAAA,CAAA,EAAA,YAAA,GAAA,wBAAA;EAAe,gBAAA,CAAA,EOuoBH,oBPvoBG;AAuG9B,CAAA,CAAA,EAAa;UOkiBA;YAAgC;;AN7pBjC,iBM0sBI,6BAAA,CN1sBU,IAAA,EM2sBhB,iBN3sB2B,EAAA,EAAA,IA2DoB,CA3DpB,EAAA;EA2DxB,IAAA,CAAA,EAAA,YAAA,GAsBZ,wBAAA;EAtBwC,gBAAA,CAAA,EMipBuC,oBNjpBvC;CAAgB,CAAA,EAAA;EAAgB,MAAA,EMkpB5D,oBNlpB4D;EAAI,QAAA,EMkpBhC,ONlpBgC,EAAA"}
|
package/dist/index.mjs
CHANGED
|
@@ -645,6 +645,10 @@ const buildBareTokenRegex = () => {
|
|
|
645
645
|
* Validates a single pattern for common issues.
|
|
646
646
|
*/
|
|
647
647
|
const validatePattern = (pattern, seenPatterns) => {
|
|
648
|
+
if (!pattern.trim()) return {
|
|
649
|
+
message: "Empty pattern is not allowed",
|
|
650
|
+
type: "empty_pattern"
|
|
651
|
+
};
|
|
648
652
|
if (seenPatterns.has(pattern)) return {
|
|
649
653
|
message: `Duplicate pattern: "${pattern}"`,
|
|
650
654
|
type: "duplicate"
|
|
@@ -727,7 +731,7 @@ const validateRules = (rules) => {
|
|
|
727
731
|
hasIssues = true;
|
|
728
732
|
}
|
|
729
733
|
}
|
|
730
|
-
if ("template" in rule && rule.template) {
|
|
734
|
+
if ("template" in rule && rule.template !== void 0) {
|
|
731
735
|
const seenPatterns = /* @__PURE__ */ new Set();
|
|
732
736
|
const issue = validatePattern(rule.template, seenPatterns);
|
|
733
737
|
if (issue) {
|
|
@@ -1245,16 +1249,71 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
|
|
|
1245
1249
|
*/
|
|
1246
1250
|
const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
|
|
1247
1251
|
const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
|
|
1248
|
-
for (
|
|
1252
|
+
for (let i = 0; i < expandedBreakpoints.length; i++) {
|
|
1253
|
+
const { rule, regex, excludeSet, skipWhenRegex } = expandedBreakpoints[i];
|
|
1249
1254
|
if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
|
|
1250
1255
|
if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
|
|
1251
1256
|
if (skipWhenRegex?.test(remainingContent)) continue;
|
|
1252
|
-
if (regex === null) return
|
|
1257
|
+
if (regex === null) return {
|
|
1258
|
+
breakpointIndex: i,
|
|
1259
|
+
breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
|
|
1260
|
+
rule
|
|
1261
|
+
};
|
|
1253
1262
|
const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
|
|
1254
|
-
if (breakPos > 0) return
|
|
1263
|
+
if (breakPos > 0) return {
|
|
1264
|
+
breakpointIndex: i,
|
|
1265
|
+
breakPos,
|
|
1266
|
+
rule
|
|
1267
|
+
};
|
|
1255
1268
|
}
|
|
1256
|
-
return
|
|
1269
|
+
return null;
|
|
1270
|
+
};
|
|
1271
|
+
|
|
1272
|
+
//#endregion
|
|
1273
|
+
//#region src/segmentation/debug-meta.ts
|
|
1274
|
+
const resolveDebugConfig = (debug) => {
|
|
1275
|
+
if (!debug) return null;
|
|
1276
|
+
if (debug === true) return {
|
|
1277
|
+
includeBreakpoint: true,
|
|
1278
|
+
includeRule: true,
|
|
1279
|
+
metaKey: "_flappa"
|
|
1280
|
+
};
|
|
1281
|
+
if (typeof debug !== "object") return null;
|
|
1282
|
+
const metaKey = debug.metaKey;
|
|
1283
|
+
const include = debug.include;
|
|
1284
|
+
const includeRule = Array.isArray(include) ? include.includes("rule") : true;
|
|
1285
|
+
return {
|
|
1286
|
+
includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
|
|
1287
|
+
includeRule,
|
|
1288
|
+
metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
|
|
1289
|
+
};
|
|
1290
|
+
};
|
|
1291
|
+
const getRulePatternType = (rule) => {
|
|
1292
|
+
if ("lineStartsWith" in rule) return "lineStartsWith";
|
|
1293
|
+
if ("lineStartsAfter" in rule) return "lineStartsAfter";
|
|
1294
|
+
if ("lineEndsWith" in rule) return "lineEndsWith";
|
|
1295
|
+
if ("template" in rule) return "template";
|
|
1296
|
+
return "regex";
|
|
1297
|
+
};
|
|
1298
|
+
const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
|
|
1299
|
+
const mergeDebugIntoMeta = (meta, metaKey, patch) => {
|
|
1300
|
+
const out = meta ? { ...meta } : {};
|
|
1301
|
+
const existing = out[metaKey];
|
|
1302
|
+
out[metaKey] = {
|
|
1303
|
+
...isPlainObject(existing) ? existing : {},
|
|
1304
|
+
...patch
|
|
1305
|
+
};
|
|
1306
|
+
return out;
|
|
1257
1307
|
};
|
|
1308
|
+
const buildRuleDebugPatch = (ruleIndex, rule) => ({ rule: {
|
|
1309
|
+
index: ruleIndex,
|
|
1310
|
+
patternType: getRulePatternType(rule)
|
|
1311
|
+
} });
|
|
1312
|
+
const buildBreakpointDebugPatch = (breakpointIndex, rule) => ({ breakpoint: {
|
|
1313
|
+
index: breakpointIndex,
|
|
1314
|
+
kind: rule.pattern === "" ? "pageBoundary" : "pattern",
|
|
1315
|
+
pattern: rule.pattern
|
|
1316
|
+
} });
|
|
1258
1317
|
|
|
1259
1318
|
//#endregion
|
|
1260
1319
|
//#region src/segmentation/breakpoint-processor.ts
|
|
@@ -1338,15 +1397,20 @@ const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds,
|
|
|
1338
1397
|
const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
|
|
1339
1398
|
if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
|
|
1340
1399
|
const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
|
|
1341
|
-
if (exclusionBreak > 0) return exclusionBreak;
|
|
1400
|
+
if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
|
|
1342
1401
|
}
|
|
1343
|
-
const
|
|
1402
|
+
const patternMatch = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
|
|
1344
1403
|
expandedBreakpoints,
|
|
1345
1404
|
normalizedPages,
|
|
1346
1405
|
pageIds,
|
|
1347
1406
|
prefer
|
|
1348
1407
|
});
|
|
1349
|
-
|
|
1408
|
+
if (patternMatch && patternMatch.breakPos > 0) return {
|
|
1409
|
+
breakOffset: patternMatch.breakPos,
|
|
1410
|
+
breakpointIndex: patternMatch.breakpointIndex,
|
|
1411
|
+
breakpointRule: patternMatch.rule
|
|
1412
|
+
};
|
|
1413
|
+
return { breakOffset: windowEndPosition };
|
|
1350
1414
|
};
|
|
1351
1415
|
/**
|
|
1352
1416
|
* Advances cursor position past any leading whitespace.
|
|
@@ -1362,12 +1426,13 @@ const skipWhitespace$1 = (content, startPos) => {
|
|
|
1362
1426
|
*
|
|
1363
1427
|
* Uses precomputed boundary positions for O(log n) page attribution lookups.
|
|
1364
1428
|
*/
|
|
1365
|
-
const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
|
|
1429
|
+
const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey) => {
|
|
1366
1430
|
const result = [];
|
|
1367
1431
|
const fullContent = segment.content;
|
|
1368
1432
|
let cursorPos = 0;
|
|
1369
1433
|
let currentFromIdx = fromIdx;
|
|
1370
1434
|
let isFirstPiece = true;
|
|
1435
|
+
let lastBreakpoint = null;
|
|
1371
1436
|
const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
|
|
1372
1437
|
logger?.debug?.("[breakpoints] boundaryPositions built", {
|
|
1373
1438
|
boundaryPositions,
|
|
@@ -1382,7 +1447,9 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
|
|
|
1382
1447
|
const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
|
|
1383
1448
|
const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
|
|
1384
1449
|
if (remainingSpan <= maxPages && !remainingHasExclusions) {
|
|
1385
|
-
const
|
|
1450
|
+
const includeMeta = isFirstPiece || Boolean(debugMetaKey);
|
|
1451
|
+
const meta = debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0;
|
|
1452
|
+
const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta);
|
|
1386
1453
|
if (finalSeg) result.push(finalSeg);
|
|
1387
1454
|
break;
|
|
1388
1455
|
}
|
|
@@ -1393,8 +1460,12 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
|
|
|
1393
1460
|
cursorPos,
|
|
1394
1461
|
windowEndIdx
|
|
1395
1462
|
});
|
|
1396
|
-
const
|
|
1397
|
-
|
|
1463
|
+
const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
|
|
1464
|
+
if (found.breakpointIndex !== void 0 && found.breakpointRule) lastBreakpoint = {
|
|
1465
|
+
breakpointIndex: found.breakpointIndex,
|
|
1466
|
+
rule: found.breakpointRule
|
|
1467
|
+
};
|
|
1468
|
+
const breakPos = cursorPos + found.breakOffset;
|
|
1398
1469
|
const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
|
|
1399
1470
|
const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
|
|
1400
1471
|
logger?.trace?.("[breakpoints] piece", {
|
|
@@ -1403,7 +1474,8 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
|
|
|
1403
1474
|
pieceLength: pieceContent.length
|
|
1404
1475
|
});
|
|
1405
1476
|
if (pieceContent) {
|
|
1406
|
-
const
|
|
1477
|
+
const includeMeta = isFirstPiece || Boolean(debugMetaKey);
|
|
1478
|
+
const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0, includeMeta);
|
|
1407
1479
|
if (pieceSeg) result.push(pieceSeg);
|
|
1408
1480
|
}
|
|
1409
1481
|
cursorPos = skipWhitespace$1(fullContent, breakPos);
|
|
@@ -1418,7 +1490,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
|
|
|
1418
1490
|
*
|
|
1419
1491
|
* Note: This is an internal engine used by `segmentPages()`.
|
|
1420
1492
|
*/
|
|
1421
|
-
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
|
|
1493
|
+
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey) => {
|
|
1422
1494
|
const pageIds = pages.map((p) => p.id);
|
|
1423
1495
|
const pageIdToIndex = buildPageIdToIndexMap(pageIds);
|
|
1424
1496
|
const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
|
|
@@ -1446,7 +1518,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
|
|
|
1446
1518
|
result.push(segment);
|
|
1447
1519
|
continue;
|
|
1448
1520
|
}
|
|
1449
|
-
const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
|
|
1521
|
+
const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey);
|
|
1450
1522
|
result.push(...broken.map((s) => {
|
|
1451
1523
|
const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
|
|
1452
1524
|
const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
|
|
@@ -2059,13 +2131,25 @@ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
|
|
|
2059
2131
|
}
|
|
2060
2132
|
return matches;
|
|
2061
2133
|
};
|
|
2062
|
-
const applyOccurrenceFilter = (rules, splitPointsByRule) => {
|
|
2134
|
+
const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
|
|
2063
2135
|
const result = [];
|
|
2064
2136
|
rules.forEach((rule, index) => {
|
|
2065
2137
|
const points = splitPointsByRule.get(index);
|
|
2066
2138
|
if (!points?.length) return;
|
|
2067
2139
|
const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
|
|
2068
|
-
|
|
2140
|
+
if (!debugMetaKey) {
|
|
2141
|
+
result.push(...filtered.map((p) => ({
|
|
2142
|
+
...p,
|
|
2143
|
+
ruleIndex: index
|
|
2144
|
+
})));
|
|
2145
|
+
return;
|
|
2146
|
+
}
|
|
2147
|
+
const debugPatch = buildRuleDebugPatch(index, rule);
|
|
2148
|
+
result.push(...filtered.map((p) => ({
|
|
2149
|
+
...p,
|
|
2150
|
+
meta: mergeDebugIntoMeta(p.meta, debugMetaKey, debugPatch),
|
|
2151
|
+
ruleIndex: index
|
|
2152
|
+
})));
|
|
2069
2153
|
});
|
|
2070
2154
|
return result;
|
|
2071
2155
|
};
|
|
@@ -2203,7 +2287,7 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
|
|
|
2203
2287
|
if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
|
|
2204
2288
|
return [initialSeg];
|
|
2205
2289
|
};
|
|
2206
|
-
const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
|
|
2290
|
+
const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey, logger) => {
|
|
2207
2291
|
logger?.debug?.("[segmenter] collecting split points from rules", {
|
|
2208
2292
|
contentLength: matchContent.length,
|
|
2209
2293
|
ruleCount: rules.length
|
|
@@ -2218,7 +2302,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
|
|
|
2218
2302
|
const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
|
|
2219
2303
|
if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
|
|
2220
2304
|
for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
|
|
2221
|
-
return applyOccurrenceFilter(rules, splitPointsByRule);
|
|
2305
|
+
return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
|
|
2222
2306
|
};
|
|
2223
2307
|
/**
|
|
2224
2308
|
* Finds page breaks within a given offset range using binary search.
|
|
@@ -2321,6 +2405,8 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
|
|
|
2321
2405
|
*/
|
|
2322
2406
|
const segmentPages = (pages, options) => {
|
|
2323
2407
|
const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
|
|
2408
|
+
const debug = resolveDebugConfig(options.debug);
|
|
2409
|
+
const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
|
|
2324
2410
|
logger?.info?.("[segmenter] starting segmentation", {
|
|
2325
2411
|
breakpointCount: breakpoints.length,
|
|
2326
2412
|
maxPages,
|
|
@@ -2334,7 +2420,7 @@ const segmentPages = (pages, options) => {
|
|
|
2334
2420
|
pageIds: pageMap.pageIds,
|
|
2335
2421
|
totalContentLength: matchContent.length
|
|
2336
2422
|
});
|
|
2337
|
-
const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
|
|
2423
|
+
const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
|
|
2338
2424
|
const unique = dedupeSplitPoints(splitPoints);
|
|
2339
2425
|
logger?.debug?.("[segmenter] split points collected", {
|
|
2340
2426
|
rawSplitPoints: splitPoints.length,
|
|
@@ -2353,7 +2439,7 @@ const segmentPages = (pages, options) => {
|
|
|
2353
2439
|
if (maxPages >= 0 && breakpoints.length) {
|
|
2354
2440
|
logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
|
|
2355
2441
|
const patternProcessor = (p) => processPattern(p, false).pattern;
|
|
2356
|
-
const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
|
|
2442
|
+
const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0);
|
|
2357
2443
|
logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
|
|
2358
2444
|
return result;
|
|
2359
2445
|
}
|