flappa-doormal 2.7.0 → 2.9.0
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- package/AGENTS.md +80 -5
- package/README.md +138 -47
- package/dist/index.d.mts +113 -95
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1072 -285
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/AGENTS.md
CHANGED
@@ -26,8 +26,16 @@ Traditional Arabic text segmentation requires:
 ```text
 src/
 ├── index.ts                     # Main entry point and exports
-├──
-├──
+├── analysis/                    # Analysis helpers module
+│   ├── index.ts                 # Barrel exports for analysis functions
+│   ├── shared.ts                # Shared utilities for analysis
+│   ├── line-starts.ts           # analyzeCommonLineStarts (line-based patterns)
+│   ├── repeating-sequences.ts   # analyzeRepeatingSequences (continuous text N-grams)
+│   └── *.test.ts                # Analysis tests
+├── pattern-detection.ts         # Token detection for auto-generating rules
+├── pattern-detection.test.ts    # Pattern detection tests
+├── recovery.ts                  # Marker recovery utility (recover mistaken lineStartsAfter)
+├── recovery.test.ts             # Marker recovery tests
 └── segmentation/
     ├── types.ts                 # TypeScript type definitions for rules/segments
     ├── segmenter.ts             # Core segmentation engine (segmentPages)
@@ -56,6 +64,11 @@ src/
    - Takes array of `{id, content}` pages and split rules
    - Returns array of `{content, from, to?, meta?}` segments

+1. **`recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector)`** - Recovery helper
+   - Use when a client mistakenly used `lineStartsAfter` where `lineStartsWith` was intended
+   - Deterministic mode reruns segmentation with selected rules converted to `lineStartsWith` and merges recovered `content` back into the provided segments
+   - Optional `mode: 'best_effort_then_rerun'` attempts a conservative anchor-based recovery first, then falls back to rerun for unresolved segments
+
 2. **`tokens.ts`** - Template system
    - `TOKEN_PATTERNS` - Map of token names to regex patterns
    - `expandTokensWithCaptures()` - Expands `{{token:name}}` syntax
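
A minimal sketch of the recovery helper added in this hunk, using only names that appear in this diff (the full worked example is in the README changes below):

```typescript
import { recoverMistakenLineStartsAfterMarkers, segmentPages } from 'flappa-doormal';

// A rule that strips its marker: the segment content loses 'وروى '.
const pages = [{ id: 1, content: 'وروى أحمد' }];
const options = { rules: [{ lineStartsAfter: ['وروى '] }] };
const segments = segmentPages(pages, options);

// Deterministic mode: rerun with rule 0 converted to `lineStartsWith`,
// then merge the recovered content back into the provided segments.
const { segments: recovered, report } = recoverMistakenLineStartsAfterMarkers(
    pages,
    segments,
    options,
    { type: 'rule_indices', indices: [0] },
);
```
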
@@ -195,7 +208,7 @@ Raw `regex` patterns now support named capture groups for metadata extraction:

 ### Breakpoints Post-Processing Algorithm

-The `breakpoints` option provides a post-processing mechanism for limiting segment size.
+The `breakpoints` option provides a post-processing mechanism for limiting segment size. Breakpoints run AFTER all structural rules.

 **API Options:**
 ```typescript
@@ -237,7 +250,7 @@ segmentPages(pages, {
 - **`prefer: 'shorter'`**: Finds FIRST match (conservative)
 - **Recursive**: If split result still exceeds `maxPages`, breakpoints runs again

-> **Note**:
+> **Note**: Older per-rule span limiting approaches were removed in favor of post-processing `breakpoints`.

 ## Design Decisions

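
A sketch of how these options combine, assuming `maxPages`, `breakpoints`, and `prefer` are sibling options on `segmentPages` and that `breakpoints` entries are pattern strings (the `BreakpointRule` shape is not shown in this diff):

```typescript
import { segmentPages, type Page } from 'flappa-doormal';

const pages: Page[] = [{ id: 1, content: '...' }, { id: 2, content: '...' }];

// Structural rules run first; the breakpoints pass then splits any
// segment that still spans more than `maxPages` pages, recursively.
const segments = segmentPages(pages, {
    rules: [{ lineStartsWith: ['{{bab}}'] }],
    maxPages: 3,
    breakpoints: ['{{tarqim}}'], // assumed shape: pattern strings, like rule patterns
    prefer: 'shorter',           // take the FIRST candidate match (conservative)
});
```
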
@@ -350,7 +363,7 @@ bunx biome lint .

 5. **Rule order matters for specificity**: When multiple rules can match the same position, put specific patterns BEFORE generic ones. Example: `## {{raqms:num}} {{dash}}` must come before `##` to capture the number.

-6. **Post-processing beats per-rule limits**:
+6. **Post-processing beats per-rule limits**: Per-rule span limiting caused premature splits. Moving to post-processing `breakpoints` preserves structural integrity while still limiting segment size.

 7. **Window padding matters**: When calculating approximate content windows, 50% padding is needed (not 20%) to ensure enough content is captured for `prefer: 'longer'` scenarios.

@@ -362,6 +375,39 @@ bunx biome lint .

 11. **Boundary-position algorithm improves page attribution**: Building a position map of page boundaries once per segment (O(n)) enables binary search for O(log n) lookups per piece. Key insight: when a segment starts mid-page (common after structural rules), expected boundary estimates must account for the offset into the starting page. Without this adjustment, position-based lookups can return the wrong page when pages have identical content prefixes.

+### For Future AI Agents (Recovery + Repo gotchas)
+
+1. **`lineStartsAfter` vs `lineStartsWith` is not “cosmetic”**: `lineStartsAfter` changes output by stripping the matched marker via an internal `contentStartOffset` during segment construction. If a client used it by accident, you cannot reconstruct the exact stripped prefix from output alone without referencing the original pages and re-matching the marker.
+
+2. **Recovery must mirror segmentation’s preprocessing**: If `SegmentationOptions.replace` was used, recovery must apply the same replacements (see `src/segmentation/replace.ts`) before attempting anchoring or rerun alignment, otherwise substring matching and page joins will drift.
+
+3. **Page joining differs between matching and output**:
+   - Matching always happens on pages concatenated with `\\n` separators.
+   - Output segments may normalize page boundaries (`pageJoiner: 'space' | 'newline'`) and breakpoints post-processing uses its own join normalization utilities.
+   Recovery code must be explicit about which representation it’s searching.
+
+4. **Breakpoints can produce “pieces” that were never marker-stripped**: When `maxPages` + `breakpoints` are enabled, only the piece that starts at the original structural boundary could have lost a marker due to `lineStartsAfter`. Mid-segment breakpoint pieces should not be “recovered” unless you can anchor them confidently.
+
+5. **Fuzzy defaults are easy to miss**: Some tokens auto-enable fuzzy matching unless `fuzzy: false` is set (`bab`, `basmalah`, `fasl`, `kitab`, `naql`). If you are validating markers or re-matching prefixes, use the same compilation path as segmentation (`buildRuleRegex` / `processPattern`) so diacritics and token expansion behave identically.
+
+6. **Auto-escaping applies to template-like patterns**: `lineStartsWith`, `lineStartsAfter`, `lineEndsWith`, and `template` auto-escape `()[]` outside `{{tokens}}`. Raw `regex` does not. If you compare patterns by string equality, be careful about escaping and whitespace.
+
+7. **TypeScript union pitfalls with `SplitRule`**: `SplitRule` is a union where only one pattern type should exist. Avoid mutating rules in-place with `delete` on fields (TS often narrows unions and then complains). Prefer rebuilding converted rules via destructuring (e.g. `{ lineStartsAfter, ...rest }` then create `{ ...rest, lineStartsWith: lineStartsAfter }`); see the sketch after this section.
+
+8. **Biome lint constraints shape implementation**: The repo enforces low function complexity. Expect to extract helpers (alignment, selector resolution, anchoring) to keep Biome happy. Also, Biome can flag regex character-class usage as misleading; prefer alternation (e.g. `(?:\\u200C|\\u200D|\\uFEFF)`) when removing specific codepoints.
+
+9. **When debugging recovery, start here**:
+   - `src/segmentation/segmenter.ts` (how content is sliced/trimmed and how `from/to` are computed)
+   - `src/segmentation/rule-regex.ts` + `src/segmentation/tokens.ts` (token expansion + fuzzy behavior)
+   - `src/segmentation/replace.ts` (preprocessing parity)
+   - `src/recovery.ts` (recovery implementation)
+
+### Process Template (Multi-agent design review, TDD-first)
+
+If you want to repeat the “write a plan → get multiple AI critiques → synthesize → update plan → implement TDD-first” workflow, use:
+
+- `docs/ai-multi-agent-tdd-template.md`
+
 ### Architecture Insights

 - **Declarative > Imperative**: Users describe patterns, library handles regex
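
Item 7 above in code form: a minimal sketch of rebuilding a converted rule via destructuring rather than `delete`-based mutation (only `SplitRule` from this diff is assumed):

```typescript
import type { SplitRule } from 'flappa-doormal';

// Rebuild instead of mutating: pull the mistaken pattern field out, then
// construct a fresh rule carrying the remainder plus `lineStartsWith`.
const toLineStartsWith = (rule: SplitRule): SplitRule => {
    if ('lineStartsAfter' in rule) {
        const { lineStartsAfter, ...rest } = rule;
        return { ...rest, lineStartsWith: lineStartsAfter };
    }
    return rule;
};
```
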
@@ -454,3 +500,32 @@ const quoted = analyzeCommonLineStarts(pages, {
 });
 ```

+## Repeating Sequence Analysis (`analyzeRepeatingSequences`)
+
+For continuous text **without line breaks** (prose-like content), use `analyzeRepeatingSequences(pages)`. It scans for commonly repeating word/token sequences (N-grams) across pages.
+
+Key options:
+- `minElements` / `maxElements`: N-gram size range (default 1-3)
+- `minCount`: Minimum occurrences to include (default 3)
+- `topK`: Maximum patterns to return (default 20)
+- `requireToken`: Only patterns containing `{{tokens}}` (default true)
+- `normalizeArabicDiacritics`: Ignore diacritics when matching (default true)
+
+Example:
+```typescript
+import { analyzeRepeatingSequences } from 'flappa-doormal';
+
+const patterns = analyzeRepeatingSequences(pages, { minCount: 3, topK: 20 });
+// [{ pattern: '{{naql}}', count: 42, examples: [...] }, ...]
+```
+
+## Analysis → Segmentation Workflow
+
+Use analysis functions to discover patterns, then pass to `segmentPages()`:
+
+1. **Continuous text**: `analyzeRepeatingSequences()` → build rules → `segmentPages()`
+2. **Structured text**: `analyzeCommonLineStarts()` → build rules → `segmentPages()`
+
+See README.md for complete examples.
+
+
package/README.md
CHANGED
@@ -228,7 +228,6 @@ Control which matches to use:
   lineEndsWith: ['\\.'],
   split: 'after',
   occurrence: 'last', // Only split at LAST period on page
-  maxSpan: 1, // Apply per-page
 }
 ```

@@ -406,8 +405,99 @@ Key options:
 - If you paste these signatures into `lineStartsWith` / `lineStartsAfter` / `template`, that’s fine: those template pattern types **auto-escape `()[]`** outside `{{tokens}}`.
 - If you paste them into a raw `regex` rule, you may need to escape literal brackets yourself.

+### Repeating Sequence Analysis (continuous text)
+
+For texts without line breaks (continuous prose), use `analyzeRepeatingSequences()`:
+
+```typescript
+import { analyzeRepeatingSequences } from 'flappa-doormal';
+
+const patterns = analyzeRepeatingSequences(pages, {
+    minElements: 2,
+    maxElements: 4,
+    minCount: 3,
+    topK: 20,
+});
+// [{ pattern: "{{naql}}\\s*{{harf}}", count: 42, examples: [...] }, ...]
+```
+
+Key options:
+- `minElements` / `maxElements`: N-gram size range (default 1-3)
+- `minCount`: Minimum occurrences to include (default 3)
+- `topK`: Maximum patterns to return (default 20)
+- `requireToken`: Only patterns containing `{{tokens}}` (default true)
+- `normalizeArabicDiacritics`: Ignore diacritics when matching (default true)
+
+## Analysis → Segmentation Workflow
+
+Use analysis functions to discover patterns, then pass to `segmentPages()`.
+
+### Example A: Continuous Text (No Punctuation)
+
+For prose-like text without structural line breaks:
+
+```typescript
+import { analyzeRepeatingSequences, segmentPages, type Page } from 'flappa-doormal';
+
+// Continuous Arabic text with narrator phrases
+const pages: Page[] = [
+    { id: 1, content: 'حدثنا أحمد بن محمد عن عمر قال سمعت النبي حدثنا خالد بن زيد عن علي' },
+    { id: 2, content: 'حدثنا سعيد بن جبير عن ابن عباس أخبرنا يوسف عن أنس' },
+];
+
+// Step 1: Discover repeating patterns
+const patterns = analyzeRepeatingSequences(pages, { minCount: 2, topK: 10 });
+// [{ pattern: '{{naql}}', count: 5, examples: [...] }, ...]
+
+// Step 2: Build rules from discovered patterns
+const rules = patterns.filter(p => p.count >= 3).map(p => ({
+    lineStartsWith: [p.pattern],
+    split: 'at' as const,
+    fuzzy: true,
+}));
+
+// Step 3: Segment
+const segments = segmentPages(pages, { rules });
+// [{ content: 'حدثنا أحمد بن محمد عن عمر قال سمعت النبي', from: 1 }, ...]
+```
+
+### Example B: Structured Text (With Numbering)
+
+For hadith-style numbered entries:
+
+```typescript
+import { analyzeCommonLineStarts, segmentPages, type Page } from 'flappa-doormal';
+
+// Numbered hadith text
+const pages: Page[] = [
+    { id: 1, content: '٦٦٩٦ - حَدَّثَنَا أَبُو بَكْرٍ عَنِ النَّبِيِّ\n٦٦٩٧ - أَخْبَرَنَا عُمَرُ قَالَ' },
+    { id: 2, content: '٦٦٩٨ - حَدَّثَنِي مُحَمَّدٌ عَنْ عَائِشَةَ' },
+];
+
+// Step 1: Discover common line-start patterns
+const patterns = analyzeCommonLineStarts(pages, { topK: 10, minCount: 2 });
+// [{ pattern: '{{raqms}}\\s*{{dash}}', count: 3, examples: [...] }, ...]
+
+// Step 2: Build rules (add named capture for hadith number)
+const topPattern = patterns[0]?.pattern ?? '{{raqms}} {{dash}} ';
+const rules = [{
+    lineStartsAfter: [topPattern.replace('{{raqms}}', '{{raqms:num}}')],
+    split: 'at' as const,
+    meta: { type: 'hadith' }
+}];
+
+// Step 3: Segment
+const segments = segmentPages(pages, { rules });
+// [
+//   { content: 'حَدَّثَنَا أَبُو بَكْرٍ...', from: 1, meta: { type: 'hadith', num: '٦٦٩٦' } },
+//   { content: 'أَخْبَرَنَا عُمَرُ قَالَ', from: 1, meta: { type: 'hadith', num: '٦٦٩٧' } },
+//   { content: 'حَدَّثَنِي مُحَمَّدٌ...', from: 2, meta: { type: 'hadith', num: '٦٦٩٨' } },
+// ]
+```
+
 ## Rule Validation

+
 Use `validateRules()` to detect common mistakes in rule patterns before running segmentation:

 ```typescript
@@ -619,32 +709,10 @@ const segments = segmentPages(pages, {
     lineEndsWith: ['\\.'],
     split: 'after',
     occurrence: 'last',
-    maxSpan: 1
   }]
 });
 ```

-### Page Fallback for Unmatched Content
-
-When using `maxSpan` to group matches per page, use `fallback: 'page'` to prevent unmatched pages from merging with adjacent segments:
-
-```typescript
-const segments = segmentPages(pages, {
-  rules: [{
-    template: '{{tarqim}}', // Match punctuation marks
-    split: 'after',
-    occurrence: 'last',
-    maxSpan: 1,
-    fallback: 'page' // If no punctuation found, segment the page anyway
-  }]
-});
-```
-
-**Without `fallback`**: Pages without matches merge into the next segment
-**With `fallback: 'page'`**: Each page becomes its own segment even without matches
-
-> **Future extensions**: The `fallback` option may support additional values like `'skip'` (omit unmatched content) or `'line'` (split at line breaks) in future versions.
-
 ### Multiple Rules with Priority

 ```typescript
@@ -697,6 +765,53 @@ const options: SegmentationOptions = {
 const segments: Segment[] = segmentPages(pages, options);
 ```

+### Marker recovery (when `lineStartsAfter` was used by accident)
+
+If you accidentally used `lineStartsAfter` for markers that should have been preserved (e.g. Arabic connective phrases like `وروى` / `وذكر`), you can recover those missing prefixes from existing segments.
+
+#### `recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts?)`
+
+This function returns new segments with recovered `content` plus a `report` describing what happened.
+
+**Recommended (deterministic) mode**: rerun segmentation with selected rules converted to `lineStartsWith`, then merge recovered content back.
+
+```ts
+import { recoverMistakenLineStartsAfterMarkers, segmentPages } from 'flappa-doormal';
+
+const pages = [{ id: 1, content: 'وروى أحمد\nوذكر خالد' }];
+const options = { rules: [{ lineStartsAfter: ['وروى '] }, { lineStartsAfter: ['وذكر '] }] };
+
+const segments = segmentPages(pages, options);
+// segments[0].content === 'أحمد' (marker stripped)
+
+const { segments: recovered, report } = recoverMistakenLineStartsAfterMarkers(
+  pages,
+  segments,
+  options,
+  { type: 'rule_indices', indices: [0] }, // recover only the first rule
+);
+
+// recovered[0].content === 'وروى أحمد'
+// recovered[1].content === 'خالد' (unchanged)
+console.log(report.summary);
+```
+
+**Optional**: best-effort anchoring mode attempts to recover without rerunning first, then falls back to rerun for unresolved segments:
+
+```ts
+const { segments: recovered } = recoverMistakenLineStartsAfterMarkers(
+  pages,
+  segments,
+  options,
+  { type: 'rule_indices', indices: [0] },
+  { mode: 'best_effort_then_rerun' }
+);
+```
+
+Notes:
+- Recovery is **explicitly scoped** by the `selector`; it will not “guess” which rules are mistaken.
+- If your segments were heavily post-processed (trimmed/normalized/reordered), recovery may return unresolved items; see the report for details.
+
 ### `stripHtmlTags(html)`

 Remove all HTML tags from content, keeping only text.
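
Besides `rule_indices`, the declarations in `dist/index.d.mts` below allow two more selector forms. A sketch continuing the example above, assuming they scope recovery the same way:

```ts
// Select rules by their `lineStartsAfter` patterns (optionally normalized).
const byPattern = recoverMistakenLineStartsAfterMarkers(pages, segments, options, {
  type: 'lineStartsAfter_patterns',
  match: 'normalized',
  patterns: ['وروى '],
});

// Or select rules with an arbitrary predicate over (rule, index).
const byPredicate = recoverMistakenLineStartsAfterMarkers(pages, segments, options, {
  type: 'predicate',
  predicate: (rule) => 'lineStartsAfter' in rule,
});
```
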
@@ -865,9 +980,7 @@ type SplitRule = {
   // Split behavior
   split?: 'at' | 'after'; // Default: 'at'
   occurrence?: 'first' | 'last' | 'all';
-  maxSpan?: number;
   fuzzy?: boolean;
-  fallback?: 'page'; // NEW: Page-boundary fallback

   // Constraints
   min?: number;
@@ -995,28 +1108,6 @@ The library concatenates all pages into a single string for pattern matching acr

 For typical book processing (up to 6,000 pages), memory usage is well within Node.js defaults. For very large books (40,000+ pages), ensure adequate heap size.

-### `maxSpan` Sliding Window Behavior
-
-The `maxSpan` option uses a **sliding window algorithm** based on page ID difference:
-
-```typescript
-// maxSpan = maximum page ID difference when looking ahead for split points
-// Algorithm prefers LONGER segments by looking as far ahead as allowed
-
-// Pages [1, 2, 3, 4] with maxSpan: 1, occurrence: 'last'
-// Window from page 1: pages 1-2 (diff <= 1), splits at page 2's last match
-// Window from page 3: pages 3-4 (diff <= 1), splits at page 4's last match
-// Result: 2 segments spanning pages 1-2 and 3-4
-
-// Pages [1, 5, 10] with maxSpan: 1, occurrence: 'last'
-// Window from page 1: only page 1 (5-1=4 > 1), splits at page 1
-// Window from page 5: only page 5 (10-5=5 > 1), splits at page 5
-// Window from page 10: only page 10, splits at page 10
-// Result: 3 segments (pages too far apart to merge)
-```
-
-This is intentional for books where page IDs represent actual page numbers. With `occurrence: 'last'`, the algorithm finds the last match within the lookahead window, creating longer segments where possible.
-
 ## For AI Agents

 See [AGENTS.md](./AGENTS.md) for:
package/dist/index.d.mts
CHANGED
@@ -238,38 +238,9 @@ type SplitBehavior = {
    * - `'first'`: Only split at the first match
    * - `'last'`: Only split at the last match
    *
-   * When `maxSpan` is set, occurrence filtering is applied per sliding
-   * window rather than globally. With `'last'`, the algorithm prefers
-   * longer segments by looking as far ahead as allowed before selecting
-   * the last match in the window.
-   *
    * @default 'all'
    */
   occurrence?: 'first' | 'last' | 'all';
-  /**
-   * Maximum page ID difference allowed when looking ahead for split points.
-   *
-   * Uses a sliding window algorithm that prefers longer segments:
-   * 1. Start from the first page of the current segment
-   * 2. Look for matches within pages where `pageId - startPageId <= maxSpan`
-   * 3. Apply occurrence filter (e.g., 'last') to select a match
-   * 4. Next window starts from the page after the match
-   *
-   * Examples:
-   * - `maxSpan: 1` = look 1 page ahead (segments span at most 2 pages)
-   * - `maxSpan: 2` = look 2 pages ahead (segments span at most 3 pages)
-   * - `undefined` = no limit (entire content treated as one group)
-   *
-   * Note: With non-consecutive page IDs, the algorithm uses actual ID
-   * difference, not array index. Pages 1 and 5 have a difference of 4.
-   *
-   * @example
-   * // Split at last period, looking up to 1 page ahead
-   * // Pages 1,2: split at page 2's last period
-   * // Page 3: split at page 3's last period
-   * { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }
-   */
-  maxSpan?: number;
   /**
    * Enable diacritic-insensitive matching for Arabic text.
    *
@@ -354,12 +325,6 @@ type RuleConstraints = {
    * { lineStartsWith: ['{{bab}}'], split: 'before', meta: { type: 'chapter' } }
    */
   meta?: Record<string, unknown>;
-  /**
-   * Fallback behavior when no matches are found within a maxSpan boundary.
-   * - 'page': Create split points at page boundaries
-   * - undefined: No fallback (current behavior)
-   */
-  fallback?: 'page';
   /**
    * Page-start guard: only allow this rule to match at the START of a page if the
    * previous page's last non-whitespace character matches this pattern.
@@ -388,7 +353,7 @@ type RuleConstraints = {
  * Each rule must specify:
  * - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
  *   `lineStartsAfter`, or `lineEndsWith`
- * - **Split behavior**: `split` (optional, defaults to `'at'`), `occurrence`, `
+ * - **Split behavior**: `split` (optional, defaults to `'at'`), `occurrence`, `fuzzy`
  * - **Constraints** (optional): `min`, `max`, `meta`
  *
  * @example
@@ -424,7 +389,6 @@ type SplitRule = PatternType & SplitBehavior & RuleConstraints;
 type Page = {
   /**
    * Unique page/entry ID used for:
-   * - `maxSpan` grouping (segments spanning multiple pages)
    * - `min`/`max` constraint filtering
    * - `from`/`to` tracking in output segments
    */
@@ -625,6 +589,21 @@ type SegmentationOptions = {
    * rule's metadata is used for each segment.
    */
   rules?: SplitRule[];
+  /**
+   * Attach debugging provenance into `segment.meta` indicating which rule and/or breakpoint
+   * created the segment boundary.
+   *
+   * This is opt-in because it increases output size.
+   *
+   * When enabled (default metaKey: `_flappa`), segments may include:
+   * `meta._flappa.rule` and/or `meta._flappa.breakpoint`.
+   */
+  debug?: boolean | {
+    /** Where to store provenance in meta. @default '_flappa' */
+    metaKey?: string;
+    /** Which kinds of provenance to include. @default ['rule','breakpoint'] */
+    include?: Array<'rule' | 'breakpoint'>;
+  };
   /**
    * Maximum pages per segment before breakpoints are applied.
    *
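
A usage sketch for the `debug` option declared above; the exact contents of the recorded provenance objects are not shown in this diff:

```typescript
import { segmentPages, type Page } from 'flappa-doormal';

const pages: Page[] = [{ id: 1, content: '١ - حدثنا أحمد' }];

const segments = segmentPages(pages, {
    rules: [{ lineStartsWith: ['{{raqms}} {{dash}} '] }],
    debug: { metaKey: '_flappa', include: ['rule', 'breakpoint'] },
});

// Segments may now carry provenance under the configured key, e.g.
// segments[0].meta?._flappa.rule       → which rule created the boundary
// segments[0].meta?._flappa.breakpoint → which breakpoint split the piece
```
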
@@ -770,7 +749,7 @@ type Segment = {
 /**
  * Types of validation issues that can be detected.
  */
-type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate';
+type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern';
 /**
  * A validation issue found in a pattern.
  */
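
A sketch of how the new `empty_pattern` issue might surface; the `validateRules()` result shape beyond `ValidationIssue.type` is assumed here:

```typescript
import { validateRules } from 'flappa-doormal';

// Assumed: an empty pattern string is now flagged instead of silently matching everywhere.
const results = validateRules([{ lineStartsWith: [''] }]);
// e.g. results[0].issues[0].type === 'empty_pattern'
```
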
@@ -1149,70 +1128,18 @@ declare const getAvailableTokens: () => string[];
  */
 declare const getTokenPattern: (tokenName: string) => string | undefined;
 //#endregion
-//#region src/analysis.d.ts
+//#region src/analysis/line-starts.d.ts
 type LineStartAnalysisOptions = {
-  /** Return top K patterns (after filtering). Default: 20 */
   topK?: number;
-  /** Only consider the first N characters of each trimmed line. Default: 60 */
   prefixChars?: number;
-  /** Ignore lines shorter than this (after trimming). Default: 6 */
   minLineLength?: number;
-  /** Only include patterns that appear at least this many times. Default: 3 */
   minCount?: number;
-  /** Keep up to this many example lines per pattern. Default: 5 */
   maxExamples?: number;
-  /**
-   * If true, include a literal first word when no token match is found at the start.
-   * Default: true
-   */
   includeFirstWordFallback?: boolean;
-  /**
-   * If true, strip Arabic diacritics (harakat/tashkeel) for the purposes of matching tokens.
-   * This helps patterns like `وأَخْبَرَنَا` match the `{{naql}}` token (`وأخبرنا`).
-   *
-   * Note: examples are still stored in their original (unstripped) form.
-   *
-   * Default: true
-   */
   normalizeArabicDiacritics?: boolean;
-  /**
-   * How to sort patterns before applying `topK`.
-   *
-   * - `specificity` (default): prioritize more structured prefixes first (tokenCount, then literalLen), then count.
-   * - `count`: prioritize highest-frequency patterns first, then specificity.
-   */
   sortBy?: 'specificity' | 'count';
-  /**
-   * Optional filter to restrict which lines are analyzed.
-   *
-   * The `line` argument is the trimmed + whitespace-collapsed version of the line.
-   * Return `true` to include it, `false` to skip it.
-   *
-   * @example
-   * // Only analyze markdown H2 headings
-   * { lineFilter: (line) => line.startsWith('## ') }
-   */
   lineFilter?: (line: string, pageId: number) => boolean;
-  /**
-   * Optional list of prefix matchers to consume before tokenization.
-   *
-   * This is for "syntactic" prefixes that are common at line start but are not
-   * meaningful as tokens by themselves (e.g. markdown headings like `##`).
-   *
-   * Each matcher is applied at the current position. If it matches, the matched
-   * text is appended (escaped) to the signature and the scanner advances.
-   *
-   * @example
-   * // Support markdown blockquotes and headings
-   * { prefixMatchers: [/^>+/u, /^#+/u] }
-   */
   prefixMatchers?: RegExp[];
-  /**
-   * How to represent whitespace in returned `pattern` signatures.
-   *
-   * - `regex` (default): use `\\s*` placeholders between tokens (useful if you paste patterns into regex-ish templates).
-   * - `space`: use literal single spaces (`' '`) between tokens (safer if you don't want `\\s` to match newlines when reused as regex).
-   */
   whitespace?: 'regex' | 'space';
 };
 type LineStartPatternExample = {
@@ -1226,12 +1153,41 @@ type CommonLineStartPattern = {
 };
 /**
  * Analyze pages and return the most common line-start patterns (top K).
- *
- * This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
- * template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
  */
 declare const analyzeCommonLineStarts: (pages: Page[], options?: LineStartAnalysisOptions) => CommonLineStartPattern[];
 //#endregion
+//#region src/analysis/repeating-sequences.d.ts
+type RepeatingSequenceOptions = {
+  minElements?: number;
+  maxElements?: number;
+  minCount?: number;
+  topK?: number;
+  normalizeArabicDiacritics?: boolean;
+  requireToken?: boolean;
+  whitespace?: 'regex' | 'space';
+  maxExamples?: number;
+  contextChars?: number;
+  maxUniquePatterns?: number;
+};
+type RepeatingSequenceExample = {
+  text: string;
+  context: string;
+  pageId: number;
+  startIndices: number[];
+};
+type RepeatingSequencePattern = {
+  pattern: string;
+  count: number;
+  examples: RepeatingSequenceExample[];
+};
+/**
+ * Analyze pages for commonly repeating word sequences.
+ *
+ * Use for continuous text without line breaks. For line-based analysis,
+ * use `analyzeCommonLineStarts()` instead.
+ */
+declare const analyzeRepeatingSequences: (pages: Page[], options?: RepeatingSequenceOptions) => RepeatingSequencePattern[];
+//#endregion
 //#region src/detection.d.ts
 /**
  * Pattern detection utilities for recognizing template tokens in Arabic text.
@@ -1307,5 +1263,67 @@ declare const analyzeTextForRule: (text: string) => {
   detected: DetectedPattern[];
 } | null;
 //#endregion
-
+//#region src/recovery.d.ts
+type MarkerRecoverySelector = {
+  type: 'rule_indices';
+  indices: number[];
+} | {
+  type: 'lineStartsAfter_patterns';
+  match?: 'exact' | 'normalized';
+  patterns: string[];
+} | {
+  type: 'predicate';
+  predicate: (rule: SplitRule, index: number) => boolean;
+};
+type MarkerRecoveryRun = {
+  options: SegmentationOptions;
+  pages: Page[];
+  segments: Segment[];
+  selector: MarkerRecoverySelector;
+};
+type MarkerRecoveryReport = {
+  summary: {
+    mode: 'rerun_only' | 'best_effort_then_rerun';
+    recovered: number;
+    totalSegments: number;
+    unchanged: number;
+    unresolved: number;
+  };
+  byRun?: Array<{
+    recovered: number;
+    runIndex: number;
+    totalSegments: number;
+    unresolved: number;
+  }>;
+  details: Array<{
+    from: number;
+    originalStartPreview: string;
+    recoveredPrefixPreview?: string;
+    recoveredStartPreview?: string;
+    segmentIndex: number;
+    status: 'recovered' | 'skipped_idempotent' | 'unchanged' | 'unresolved_alignment' | 'unresolved_selector';
+    strategy: 'rerun' | 'stage1' | 'none';
+    to?: number;
+    notes?: string[];
+  }>;
+  errors: string[];
+  warnings: string[];
+};
+type NormalizeCompareMode = 'none' | 'whitespace' | 'whitespace_and_nfkc';
+declare function recoverMistakenLineStartsAfterMarkers(pages: Page[], segments: Segment[], options: SegmentationOptions, selector: MarkerRecoverySelector, opts?: {
+  mode?: 'rerun_only' | 'best_effort_then_rerun';
+  normalizeCompare?: NormalizeCompareMode;
+}): {
+  report: MarkerRecoveryReport;
+  segments: Segment[];
+};
+declare function recoverMistakenMarkersForRuns(runs: MarkerRecoveryRun[], opts?: {
+  mode?: 'rerun_only' | 'best_effort_then_rerun';
+  normalizeCompare?: NormalizeCompareMode;
+}): {
+  report: MarkerRecoveryReport;
+  segments: Segment[];
+};
+//#endregion
+export { type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type DetectedPattern, type ExpandResult, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type MarkerRecoveryReport, type MarkerRecoveryRun, type MarkerRecoverySelector, type Page, type PageRange, type RepeatingSequenceExample, type RepeatingSequenceOptions, type RepeatingSequencePattern, type ReplaceRule, type RuleValidationResult, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, type ValidationIssue, type ValidationIssueType, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
 //# sourceMappingURL=index.d.mts.map
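
A sketch of the batch variant declared above, `recoverMistakenMarkersForRuns`, assuming each `MarkerRecoveryRun` is processed like a single-run call and `report.byRun` aggregates per-run counts:

```typescript
import { recoverMistakenMarkersForRuns, segmentPages, type Page } from 'flappa-doormal';

const pagesA: Page[] = [{ id: 1, content: 'وروى أحمد' }];
const optionsA = { rules: [{ lineStartsAfter: ['وروى '] }] };
const segmentsA = segmentPages(pagesA, optionsA);

// One entry per book/run; each selector scopes which rules are recovered.
const { segments, report } = recoverMistakenMarkersForRuns(
    [{ pages: pagesA, segments: segmentsA, options: optionsA, selector: { type: 'rule_indices', indices: [0] } }],
    { mode: 'rerun_only', normalizeCompare: 'whitespace' },
);

console.log(report.summary, report.byRun);
```
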