flappa-doormal 2.3.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +55 -0
- package/README.md +260 -5
- package/dist/index.d.mts +106 -22
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +657 -312
- package/dist/index.mjs.map +1 -1
- package/package.json +6 -5
package/AGENTS.md
CHANGED
|
@@ -379,6 +379,8 @@ bunx biome lint .
|
|
|
379
379
|
| `{{raqms}}` | Multiple Arabic-Indic numerals | ٧٥٦٣ |
|
|
380
380
|
| `{{raqms:num}}` | Numerals with named capture | `meta.num = "٧٥٦٣"` |
|
|
381
381
|
| `{{dash}}` | Various dash characters | - – — ـ |
|
|
382
|
+
| `{{harfs}}` | Single-letter codes separated by spaces | `د ت س ي ق` |
|
|
383
|
+
| `{{rumuz}}` | rijāl/takhrīj source abbreviations (matches blocks like `خت ٤`, `خ سي`) | `خت ٤` |
|
|
382
384
|
| `{{numbered}}` | Composite: `{{raqms}} {{dash}}` | ٧٥٦٣ - |
|
|
383
385
|
|
|
384
386
|
**Named captures**: Add `:name` suffix to capture into `meta`:
|
|
@@ -387,3 +389,56 @@ bunx biome lint .
|
|
|
387
389
|
// → segment.meta.hadithNum = "٧٥٦٣"
|
|
388
390
|
```
|
|
389
391
|
|
|
392
|
+
## Page-start Guard (`pageStartGuard`)
|
|
393
|
+
|
|
394
|
+
Some books contain page-wrap continuations where a new page starts with a common line-start marker (e.g. `{{naql}}`) but it is not a true new segment.
|
|
395
|
+
|
|
396
|
+
Use `pageStartGuard` on a rule to allow matches at the start of a page **only if** the previous page’s last non-whitespace character matches a pattern (tokens supported):
|
|
397
|
+
|
|
398
|
+
```typescript
|
|
399
|
+
{
|
|
400
|
+
fuzzy: true,
|
|
401
|
+
lineStartsWith: ['{{naql}}'],
|
|
402
|
+
split: 'at',
|
|
403
|
+
pageStartGuard: '{{tarqim}}'
|
|
404
|
+
}
|
|
405
|
+
```
|
|
406
|
+
|
|
407
|
+
Notes:
|
|
408
|
+
- Applies only at page starts; mid-page line starts are unaffected.
|
|
409
|
+
- Implemented in `src/segmentation/segmenter.ts` match filtering.
|
|
410
|
+
|
|
411
|
+
## Analysis Helper (`analyzeCommonLineStarts`)
|
|
412
|
+
|
|
413
|
+
`analyzeCommonLineStarts(pages)` scans lines across pages and returns common template-like line-start signatures (tokenized with `TOKEN_PATTERNS`). It’s intended to help you quickly discover rule candidates without using an LLM.
|
|
414
|
+
|
|
415
|
+
Useful options (recent additions):
|
|
416
|
+
- **`sortBy`**: `'specificity'` (default) or `'count'` (highest-frequency first). `topK` is applied **after** sorting.
|
|
417
|
+
- **`lineFilter`**: restrict which lines are analyzed (e.g. only Markdown headings).
|
|
418
|
+
- **`prefixMatchers`**: consume syntactic prefixes before tokenization (default includes headings via `/^#+/u`).
|
|
419
|
+
- This is how you see variations *after* prefixes like `##` instead of collapsing to just `"##"`.
|
|
420
|
+
- **`normalizeArabicDiacritics`**: `true` by default so tokens match diacritized forms (e.g. `وأَخْبَرَنَا` → `{{naql}}`).
|
|
421
|
+
- **`whitespace`**: `'regex'` (default) uses `\\s*` placeholders; `'space'` uses literal spaces in returned signatures.
|
|
422
|
+
|
|
423
|
+
Examples:
|
|
424
|
+
|
|
425
|
+
```typescript
|
|
426
|
+
import { analyzeCommonLineStarts } from 'flappa-doormal';
|
|
427
|
+
|
|
428
|
+
// Top 20 by frequency
|
|
429
|
+
const top20 = analyzeCommonLineStarts(pages, { sortBy: 'count', topK: 20 });
|
|
430
|
+
|
|
431
|
+
// Only headings (## / ### / ...)
|
|
432
|
+
const headings = analyzeCommonLineStarts(pages, {
|
|
433
|
+
lineFilter: (line) => line.startsWith('#'),
|
|
434
|
+
sortBy: 'count',
|
|
435
|
+
});
|
|
436
|
+
|
|
437
|
+
// Custom prefixes (e.g. blockquotes + headings)
|
|
438
|
+
const quoted = analyzeCommonLineStarts(pages, {
|
|
439
|
+
lineFilter: (line) => line.startsWith('>') || line.startsWith('#'),
|
|
440
|
+
prefixMatchers: [/^>+/u, /^#+/u],
|
|
441
|
+
sortBy: 'count',
|
|
442
|
+
});
|
|
443
|
+
```
|
|
444
|
+
|
package/README.md
CHANGED
|
@@ -90,7 +90,8 @@ Replace regex with readable tokens:
|
|
|
90
90
|
| `{{raqm}}` | Single Arabic digit | `[\\u0660-\\u0669]` |
|
|
91
91
|
| `{{dash}}` | Dash variants | `[-–—ـ]` |
|
|
92
92
|
| `{{harf}}` | Arabic letter | `[أ-ي]` |
|
|
93
|
-
| `{{harfs}}` |
|
|
93
|
+
| `{{harfs}}` | Single-letter codes separated by spaces | `[أ-ي](?:\s+[أ-ي])*` |
|
|
94
|
+
| `{{rumuz}}` | Source abbreviations (rijāl/takhrīj rumuz), incl. multi-code blocks | e.g. `خت ٤`, `خ سي`, `خ فق`, `د ت سي ق` |
|
|
94
95
|
| `{{numbered}}` | Hadith numbering `٢٢ - ` | `{{raqms}} {{dash}} ` |
|
|
95
96
|
| `{{fasl}}` | Section markers | `فصل\|مسألة` |
|
|
96
97
|
| `{{tarqim}}` | Punctuation marks | `[.!?؟؛]` |
|
|
@@ -144,6 +145,26 @@ const rules = [{
|
|
|
144
145
|
| `template` | Depends | Custom pattern with full control |
|
|
145
146
|
| `regex` | Depends | Raw regex for complex cases |
|
|
146
147
|
|
|
148
|
+
### 4.1 Page-start Guard (avoid page-wrap false positives)
|
|
149
|
+
|
|
150
|
+
When matching at line starts (e.g., `{{naql}}`), a new page can begin with a marker that is actually a **continuation** of the previous page (page wrap), not a true new segment.
|
|
151
|
+
|
|
152
|
+
Use `pageStartGuard` to allow a rule to match at the start of a page **only if** the previous page’s last non-whitespace character matches a pattern (tokens supported):
|
|
153
|
+
|
|
154
|
+
```typescript
|
|
155
|
+
const segments = segmentPages(pages, {
|
|
156
|
+
rules: [{
|
|
157
|
+
fuzzy: true,
|
|
158
|
+
lineStartsWith: ['{{naql}}'],
|
|
159
|
+
split: 'at',
|
|
160
|
+
// Only allow a split at the start of a new page if the previous page ended with sentence punctuation:
|
|
161
|
+
pageStartGuard: '{{tarqim}}'
|
|
162
|
+
}]
|
|
163
|
+
});
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
This guard applies **only at page starts**. Mid-page line starts are unaffected.
|
|
167
|
+
|
|
147
168
|
### 5. Auto-Escaping Brackets
|
|
148
169
|
|
|
149
170
|
In `lineStartsWith`, `lineStartsAfter`, `lineEndsWith`, and `template` patterns, parentheses `()` and square brackets `[]` are **automatically escaped**. This means you can write intuitive patterns without manual escaping:
|
|
@@ -296,19 +317,253 @@ const segments = segmentPages(pages, {
|
|
|
296
317
|
|
|
297
318
|
### Narrator Abbreviation Codes
|
|
298
319
|
|
|
299
|
-
Use `{{
|
|
320
|
+
Use `{{rumuz}}` for matching rijāl/takhrīj source abbreviations (common in narrator biography books and takhrīj notes):
|
|
300
321
|
|
|
301
322
|
```typescript
|
|
302
323
|
const segments = segmentPages(pages, {
|
|
303
324
|
rules: [{
|
|
304
|
-
lineStartsAfter: ['{{raqms:num}} {{
|
|
325
|
+
lineStartsAfter: ['{{raqms:num}} {{rumuz}}:'],
|
|
305
326
|
split: 'at'
|
|
306
327
|
}]
|
|
307
328
|
});
|
|
308
329
|
|
|
309
|
-
// Matches: ١١١٨
|
|
330
|
+
// Matches: ١١١٨ ع: ... / ١١١٨ خ سي: ... / ١١١٨ خ فق: ...
|
|
310
331
|
// meta: { num: '١١١٨' }
|
|
311
|
-
// content: '
|
|
332
|
+
// content: '...' (rumuz stripped)
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
If your data uses *only single-letter codes separated by spaces* (e.g., `د ت س ي ق`), you can also use `{{harfs}}`.
|
|
336
|
+
|
|
337
|
+
## Analysis Helpers (no LLM required)
|
|
338
|
+
|
|
339
|
+
Use `analyzeCommonLineStarts(pages)` to discover common line-start signatures across a book, useful for rule authoring:
|
|
340
|
+
|
|
341
|
+
```typescript
|
|
342
|
+
import { analyzeCommonLineStarts } from 'flappa-doormal';
|
|
343
|
+
|
|
344
|
+
const patterns = analyzeCommonLineStarts(pages);
|
|
345
|
+
// [{ pattern: "{{numbered}}", count: 1234, examples: [...] }, ...]
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
You can control **what gets analyzed** and **how results are ranked**:
|
|
349
|
+
|
|
350
|
+
```typescript
|
|
351
|
+
import { analyzeCommonLineStarts } from 'flappa-doormal';
|
|
352
|
+
|
|
353
|
+
// Top 20 most common line-start signatures (by frequency)
|
|
354
|
+
const topByCount = analyzeCommonLineStarts(pages, {
|
|
355
|
+
sortBy: 'count',
|
|
356
|
+
topK: 20,
|
|
357
|
+
});
|
|
358
|
+
|
|
359
|
+
// Only analyze markdown headings of level 2 or deeper (lines beginning with "##"; use '## ' with a trailing space to match H2 only)
|
|
360
|
+
// This shows what comes AFTER the heading marker (e.g. "## {{bab}}", "## {{numbered}}\\[", etc.)
|
|
361
|
+
const headingVariants = analyzeCommonLineStarts(pages, {
|
|
362
|
+
lineFilter: (line) => line.startsWith('##'),
|
|
363
|
+
sortBy: 'count',
|
|
364
|
+
topK: 40,
|
|
365
|
+
});
|
|
366
|
+
|
|
367
|
+
// Support additional prefix styles without changing library code
|
|
368
|
+
// (e.g. markdown blockquotes ">> ..." + headings)
|
|
369
|
+
const quotedHeadings = analyzeCommonLineStarts(pages, {
|
|
370
|
+
lineFilter: (line) => line.startsWith('>') || line.startsWith('#'),
|
|
371
|
+
prefixMatchers: [/^>+/u, /^#+/u],
|
|
372
|
+
sortBy: 'count',
|
|
373
|
+
topK: 40,
|
|
374
|
+
});
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
Key options:
|
|
378
|
+
- `sortBy`: `'specificity'` (default) or `'count'` (highest frequency first)
|
|
379
|
+
- `lineFilter`: restrict which lines are counted (e.g. only headings)
|
|
380
|
+
- `prefixMatchers`: consume syntactic prefixes (default includes headings via `/^#+/u`) so you can see variations *after* the prefix
|
|
381
|
+
- `normalizeArabicDiacritics`: `true` by default (helps token matching like `وأَخْبَرَنَا` → `{{naql}}`)
|
|
382
|
+
- `whitespace`: how whitespace is represented in returned patterns:
|
|
383
|
+
- `'regex'` (default): uses `\\s*` placeholders between tokens
|
|
384
|
+
- `'space'`: uses literal single spaces (`' '`) between tokens (useful if you don't want `\\s` to later match newlines when reusing these patterns)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
## Prompting LLMs / Agents to Generate Rules (Shamela books)
|
|
388
|
+
|
|
389
|
+
### Pre-analysis (no LLM required): generate “hints” from the book
|
|
390
|
+
|
|
391
|
+
Before prompting an LLM, you can quickly extract **high-signal pattern hints** from the book using:
|
|
392
|
+
- `analyzeCommonLineStarts(pages, options)` (from `src/analysis.ts`): common **line-start signatures** (tokenized)
|
|
393
|
+
- `analyzeTextForRule(text)` / `detectTokenPatterns(text)` (from `src/detection.ts`): turn a **single representative line** into a token template suggestion
|
|
394
|
+
|
|
395
|
+
These help the LLM avoid guessing and focus on the patterns actually present.
|
|
396
|
+
|
|
397
|
+
#### Step 1: top line-start signatures (frequency-first)
|
|
398
|
+
|
|
399
|
+
```typescript
|
|
400
|
+
import { analyzeCommonLineStarts } from 'flappa-doormal';
|
|
401
|
+
|
|
402
|
+
const top = analyzeCommonLineStarts(pages, {
|
|
403
|
+
sortBy: 'count',
|
|
404
|
+
topK: 40,
|
|
405
|
+
minCount: 10,
|
|
406
|
+
});
|
|
407
|
+
|
|
408
|
+
console.log(top.map((p) => ({ pattern: p.pattern, count: p.count, example: p.examples[0] })));
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
Typical output (example):
|
|
412
|
+
|
|
413
|
+
```text
|
|
414
|
+
[
|
|
415
|
+
{ pattern: "{{numbered}}", count: 1200, example: { pageId: 50, line: "١ - حَدَّثَنَا ..." } },
|
|
416
|
+
{ pattern: "{{bab}}", count: 180, example: { pageId: 66, line: "باب ..." } },
|
|
417
|
+
{ pattern: "##\\s*{{bab}}", count: 140, example: { pageId: 69, line: "## باب ..." } }
|
|
418
|
+
]
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
If you only want to analyze headings (to see what comes *after* `##`):
|
|
422
|
+
|
|
423
|
+
```typescript
|
|
424
|
+
const headingVariants = analyzeCommonLineStarts(pages, {
|
|
425
|
+
lineFilter: (line) => line.startsWith('##'),
|
|
426
|
+
sortBy: 'count',
|
|
427
|
+
topK: 40,
|
|
428
|
+
});
|
|
429
|
+
```
|
|
430
|
+
|
|
431
|
+
#### Step 2: convert a few representative lines into token templates
|
|
432
|
+
|
|
433
|
+
Pick 3–10 representative line prefixes from the book (often from the examples returned above) and run:
|
|
434
|
+
|
|
435
|
+
```typescript
|
|
436
|
+
import { analyzeTextForRule } from 'flappa-doormal';
|
|
437
|
+
|
|
438
|
+
console.log(analyzeTextForRule("٢٩- خ سي: أحمد بن حميد ..."));
|
|
439
|
+
// -> { template: "{{raqms}}- {{rumuz}}: أحمد...", patternType: "lineStartsAfter", fuzzy: false, ... }
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
#### Step 3: paste the “hints” into your LLM prompt
|
|
443
|
+
|
|
444
|
+
When you prompt the LLM, include a short “Hints” section:
|
|
445
|
+
- Top 20–50 `analyzeCommonLineStarts` patterns (with counts + 1–2 examples)
|
|
446
|
+
- 3–10 `analyzeTextForRule(...)` results
|
|
447
|
+
- A small sample of pages (not the full book)
|
|
448
|
+
|
|
449
|
+
Then instruct the LLM to **prioritize rules that align with those hints**.
|
|
450
|
+
|
|
451
|
+
You can use an LLM to generate `SegmentationOptions` by pasting it a random subset of pages and asking it to infer robust segmentation rules. Here’s a ready-to-copy plain-text prompt:
|
|
452
|
+
|
|
453
|
+
```text
|
|
454
|
+
You are helping me generate JSON configuration for a text-segmentation function called segmentPages(pages, options).
|
|
455
|
+
It segments Arabic book pages (e.g., Shamela) into logical segments (books/chapters/sections/entries/hadiths).
|
|
456
|
+
|
|
457
|
+
I will give you a random subset of pages so you can infer patterns. You must respond with ONLY JSON (no prose).
|
|
458
|
+
|
|
459
|
+
I will paste a random subset of pages. Each page has:
|
|
460
|
+
- id: page number (not necessarily consecutive)
|
|
461
|
+
- content: plain text; line breaks are \n
|
|
462
|
+
|
|
463
|
+
Output ONLY a JSON object compatible with SegmentationOptions (no prose, no code fences).
|
|
464
|
+
|
|
465
|
+
SegmentationOptions shape:
|
|
466
|
+
- rules: SplitRule[]
|
|
467
|
+
- optional: maxPages, breakpoints, prefer
|
|
468
|
+
|
|
469
|
+
SplitRule constraints:
|
|
470
|
+
- Each rule must use exactly ONE of: lineStartsWith, lineStartsAfter, lineEndsWith, template, regex
|
|
471
|
+
- Optional fields: split ("at" | "after"), meta, min, max, exclude, occurrence ("first" | "last"), fuzzy
|
|
472
|
+
|
|
473
|
+
Important behaviors:
|
|
474
|
+
- lineStartsAfter matches at line start but strips the marker from segment.content.
|
|
475
|
+
- Template patterns (lineStartsWith/After/EndsWith/template) auto-escape ()[] outside tokens.
|
|
476
|
+
- Raw regex patterns do NOT auto-escape and can include groups, named captures, etc.
|
|
477
|
+
|
|
478
|
+
Available tokens you may use in templates:
|
|
479
|
+
- {{basmalah}} (بسم الله / ﷽)
|
|
480
|
+
- {{kitab}} (كتاب)
|
|
481
|
+
- {{bab}} (باب)
|
|
482
|
+
- {{fasl}} (فصل | مسألة)
|
|
483
|
+
- {{naql}} (حدثنا/أخبرنا/... narration phrases)
|
|
484
|
+
- {{raqm}} (single Arabic-Indic digit)
|
|
485
|
+
- {{raqms}} (Arabic-Indic digits)
|
|
486
|
+
- {{dash}} (dash variants)
|
|
487
|
+
- {{tarqim}} (punctuation [. ! ? ؟ ؛])
|
|
488
|
+
- {{harf}} (Arabic letter)
|
|
489
|
+
- {{harfs}} (single-letter codes separated by spaces; e.g. "د ت س ي ق")
|
|
490
|
+
- {{rumuz}} (rijāl/takhrīj source abbreviations; matches blocks like "خت ٤", "خ سي", "خ فق")
|
|
491
|
+
|
|
492
|
+
Named captures:
|
|
493
|
+
- {{raqms:num}} captures to meta.num
|
|
494
|
+
- {{:name}} captures arbitrary text to meta.name
|
|
495
|
+
|
|
496
|
+
Your tasks:
|
|
497
|
+
1) Identify document structure from the sample:
|
|
498
|
+
- book headers (كتاب), chapter headers (باب), sections (فصل/مسألة), hadith numbering, biography entries, etc.
|
|
499
|
+
2) Propose a minimal but robust ordered ruleset:
|
|
500
|
+
- Put most-specific rules first.
|
|
501
|
+
- Use fuzzy:true for Arabic headings where diacritics vary.
|
|
502
|
+
- Use lineStartsAfter when you want to remove the marker (e.g., hadith numbers, rumuz prefixes).
|
|
503
|
+
3) Use constraints:
|
|
504
|
+
- Use min/max/exclude when front matter differs or specific pages are noisy.
|
|
505
|
+
4) If segments can span many pages:
|
|
506
|
+
- Set maxPages and breakpoints.
|
|
507
|
+
- Suggested breakpoints (in order): "{{tarqim}}\\s*", "\\n", "" (page boundary)
|
|
508
|
+
- Prefer "longer" unless there’s a reason to prefer shorter segments.
|
|
509
|
+
5) Capture useful metadata:
|
|
510
|
+
- For numbering patterns, capture the number into meta.num (e.g., {{raqms:num}}).
|
|
511
|
+
|
|
512
|
+
Examples (what good answers look like):
|
|
513
|
+
|
|
514
|
+
Example A: hadith-style numbered segments
|
|
515
|
+
Input pages:
|
|
516
|
+
PAGE 10:
|
|
517
|
+
٣٤ - حَدَّثَنَا ...\n... (rest of hadith)
|
|
518
|
+
PAGE 11:
|
|
519
|
+
٣٥ - حَدَّثَنَا ...\n... (rest of hadith)
|
|
520
|
+
|
|
521
|
+
Good JSON answer:
|
|
522
|
+
{
|
|
523
|
+
"rules": [
|
|
524
|
+
{
|
|
525
|
+
"lineStartsAfter": ["{{raqms:num}} {{dash}}\\s*"],
|
|
526
|
+
"split": "at",
|
|
527
|
+
"meta": { "type": "hadith" }
|
|
528
|
+
}
|
|
529
|
+
]
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
Example B: chapter markers + hadith numbers
|
|
533
|
+
Input pages:
|
|
534
|
+
PAGE 50:
|
|
535
|
+
كتاب الصلاة\nباب فضل الصلاة\n١ - حَدَّثَنَا ...\n...
|
|
536
|
+
PAGE 51:
|
|
537
|
+
٢ - حَدَّثَنَا ...\n...
|
|
538
|
+
|
|
539
|
+
Good JSON answer:
|
|
540
|
+
{
|
|
541
|
+
"rules": [
|
|
542
|
+
{ "fuzzy": true, "lineStartsWith": ["{{kitab}}"], "split": "at", "meta": { "type": "book" } },
|
|
543
|
+
{ "fuzzy": true, "lineStartsWith": ["{{bab}}"], "split": "at", "meta": { "type": "chapter" } },
|
|
544
|
+
{ "lineStartsAfter": ["{{raqms:num}}\\s*{{dash}}\\s*"], "split": "at", "meta": { "type": "hadith" } }
|
|
545
|
+
]
|
|
546
|
+
}
|
|
547
|
+
|
|
548
|
+
Example C: narrator/rijāl entries with rumuz (codes) + colon
|
|
549
|
+
Input pages:
|
|
550
|
+
PAGE 257:
|
|
551
|
+
٢٩- خ سي: أحمد بن حميد...\nوكان من حفاظ الكوفة.
|
|
552
|
+
PAGE 258:
|
|
553
|
+
١٠٢- ق: تمييز ولهم شيخ آخر...\n...
|
|
554
|
+
|
|
555
|
+
Good JSON answer:
|
|
556
|
+
{
|
|
557
|
+
"rules": [
|
|
558
|
+
{
|
|
559
|
+
"lineStartsAfter": ["{{raqms:num}}\\s*{{dash}}\\s*{{rumuz}}:\\s*"],
|
|
560
|
+
"split": "at",
|
|
561
|
+
"meta": { "type": "entry" }
|
|
562
|
+
}
|
|
563
|
+
]
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
Now wait for the pages.
|
|
312
567
|
```
|
|
313
568
|
|
|
314
569
|
### Sentence-Based Splitting (Last Period Per Page)
|
package/dist/index.d.mts
CHANGED
|
@@ -360,6 +360,27 @@ type RuleConstraints = {
|
|
|
360
360
|
* - undefined: No fallback (current behavior)
|
|
361
361
|
*/
|
|
362
362
|
fallback?: 'page';
|
|
363
|
+
/**
|
|
364
|
+
* Page-start guard: only allow this rule to match at the START of a page if the
|
|
365
|
+
* previous page's last non-whitespace character matches this pattern.
|
|
366
|
+
*
|
|
367
|
+
* This is useful for avoiding false positives caused purely by page wrap.
|
|
368
|
+
*
|
|
369
|
+
* Example use-case:
|
|
370
|
+
* - Split on `{{naql}}` at line starts (e.g. "أخبرنا ...")
|
|
371
|
+
* - BUT if a new page starts with "أخبرنا ..." and the previous page did NOT
|
|
372
|
+
* end with sentence-ending punctuation, treat it as a continuation and do not split.
|
|
373
|
+
*
|
|
374
|
+
* Notes:
|
|
375
|
+
* - This guard applies ONLY at page starts, not mid-page line starts.
|
|
376
|
+
* - This is a template pattern (tokens allowed). It is checked against the LAST
|
|
377
|
+
* non-whitespace character of the previous page's content.
|
|
378
|
+
*
|
|
379
|
+
* @example
|
|
380
|
+
* // Allow split at page start only if previous page ends with sentence punctuation
|
|
381
|
+
* { lineStartsWith: ['{{naql}}'], fuzzy: true, pageStartGuard: '{{tarqim}}' }
|
|
382
|
+
*/
|
|
383
|
+
pageStartGuard?: string;
|
|
363
384
|
};
|
|
364
385
|
/**
|
|
365
386
|
* A complete split rule combining pattern, behavior, and constraints.
|
|
@@ -720,7 +741,6 @@ type Segment = {
|
|
|
720
741
|
};
|
|
721
742
|
//#endregion
|
|
722
743
|
//#region src/segmentation/segmenter.d.ts
|
|
723
|
-
|
|
724
744
|
/**
|
|
725
745
|
* Applies breakpoints to oversized segments.
|
|
726
746
|
*
|
|
@@ -779,25 +799,6 @@ type Segment = {
|
|
|
779
799
|
*/
|
|
780
800
|
declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
|
|
781
801
|
//#endregion
|
|
782
|
-
//#region src/segmentation/textUtils.d.ts
|
|
783
|
-
/**
|
|
784
|
-
* Strip all HTML tags from content, keeping only text.
|
|
785
|
-
*
|
|
786
|
-
* @param html - HTML content
|
|
787
|
-
* @returns Plain text content
|
|
788
|
-
*/
|
|
789
|
-
declare const stripHtmlTags: (html: string) => string;
|
|
790
|
-
/**
|
|
791
|
-
* Normalizes line endings to Unix-style (`\n`).
|
|
792
|
-
*
|
|
793
|
-
* Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
|
|
794
|
-
* for consistent pattern matching across platforms.
|
|
795
|
-
*
|
|
796
|
-
* @param content - Raw content with potentially mixed line endings
|
|
797
|
-
* @returns Content with all line endings normalized to `\n`
|
|
798
|
-
*/
|
|
799
|
-
declare const normalizeLineEndings: (content: string) => string;
|
|
800
|
-
//#endregion
|
|
801
802
|
//#region src/segmentation/tokens.d.ts
|
|
802
803
|
/**
|
|
803
804
|
* Token-based template system for Arabic text pattern matching.
|
|
@@ -1039,7 +1040,90 @@ declare const getAvailableTokens: () => string[];
|
|
|
1039
1040
|
*/
|
|
1040
1041
|
declare const getTokenPattern: (tokenName: string) => string | undefined;
|
|
1041
1042
|
//#endregion
|
|
1042
|
-
//#region src/
|
|
1043
|
+
//#region src/analysis.d.ts
|
|
1044
|
+
type LineStartAnalysisOptions = {
|
|
1045
|
+
/** Return top K patterns (after filtering). Default: 20 */
|
|
1046
|
+
topK?: number;
|
|
1047
|
+
/** Only consider the first N characters of each trimmed line. Default: 60 */
|
|
1048
|
+
prefixChars?: number;
|
|
1049
|
+
/** Ignore lines shorter than this (after trimming). Default: 6 */
|
|
1050
|
+
minLineLength?: number;
|
|
1051
|
+
/** Only include patterns that appear at least this many times. Default: 3 */
|
|
1052
|
+
minCount?: number;
|
|
1053
|
+
/** Keep up to this many example lines per pattern. Default: 5 */
|
|
1054
|
+
maxExamples?: number;
|
|
1055
|
+
/**
|
|
1056
|
+
* If true, include a literal first word when no token match is found at the start.
|
|
1057
|
+
* Default: true
|
|
1058
|
+
*/
|
|
1059
|
+
includeFirstWordFallback?: boolean;
|
|
1060
|
+
/**
|
|
1061
|
+
* If true, strip Arabic diacritics (harakat/tashkeel) for the purposes of matching tokens.
|
|
1062
|
+
* This helps patterns like `وأَخْبَرَنَا` match the `{{naql}}` token (`وأخبرنا`).
|
|
1063
|
+
*
|
|
1064
|
+
* Note: examples are still stored in their original (unstripped) form.
|
|
1065
|
+
*
|
|
1066
|
+
* Default: true
|
|
1067
|
+
*/
|
|
1068
|
+
normalizeArabicDiacritics?: boolean;
|
|
1069
|
+
/**
|
|
1070
|
+
* How to sort patterns before applying `topK`.
|
|
1071
|
+
*
|
|
1072
|
+
* - `specificity` (default): prioritize more structured prefixes first (tokenCount, then literalLen), then count.
|
|
1073
|
+
* - `count`: prioritize highest-frequency patterns first, then specificity.
|
|
1074
|
+
*/
|
|
1075
|
+
sortBy?: 'specificity' | 'count';
|
|
1076
|
+
/**
|
|
1077
|
+
* Optional filter to restrict which lines are analyzed.
|
|
1078
|
+
*
|
|
1079
|
+
* The `line` argument is the trimmed + whitespace-collapsed version of the line.
|
|
1080
|
+
* Return `true` to include it, `false` to skip it.
|
|
1081
|
+
*
|
|
1082
|
+
* @example
|
|
1083
|
+
* // Only analyze markdown H2 headings
|
|
1084
|
+
* { lineFilter: (line) => line.startsWith('## ') }
|
|
1085
|
+
*/
|
|
1086
|
+
lineFilter?: (line: string, pageId: number) => boolean;
|
|
1087
|
+
/**
|
|
1088
|
+
* Optional list of prefix matchers to consume before tokenization.
|
|
1089
|
+
*
|
|
1090
|
+
* This is for "syntactic" prefixes that are common at line start but are not
|
|
1091
|
+
* meaningful as tokens by themselves (e.g. markdown headings like `##`).
|
|
1092
|
+
*
|
|
1093
|
+
* Each matcher is applied at the current position. If it matches, the matched
|
|
1094
|
+
* text is appended (escaped) to the signature and the scanner advances.
|
|
1095
|
+
*
|
|
1096
|
+
* @example
|
|
1097
|
+
* // Support markdown blockquotes and headings
|
|
1098
|
+
* { prefixMatchers: [/^>+/u, /^#+/u] }
|
|
1099
|
+
*/
|
|
1100
|
+
prefixMatchers?: RegExp[];
|
|
1101
|
+
/**
|
|
1102
|
+
* How to represent whitespace in returned `pattern` signatures.
|
|
1103
|
+
*
|
|
1104
|
+
* - `regex` (default): use `\\s*` placeholders between tokens (useful if you paste patterns into regex-ish templates).
|
|
1105
|
+
* - `space`: use literal single spaces (`' '`) between tokens (safer if you don't want `\\s` to match newlines when reused as regex).
|
|
1106
|
+
*/
|
|
1107
|
+
whitespace?: 'regex' | 'space';
|
|
1108
|
+
};
|
|
1109
|
+
type LineStartPatternExample = {
|
|
1110
|
+
line: string;
|
|
1111
|
+
pageId: number;
|
|
1112
|
+
};
|
|
1113
|
+
type CommonLineStartPattern = {
|
|
1114
|
+
pattern: string;
|
|
1115
|
+
count: number;
|
|
1116
|
+
examples: LineStartPatternExample[];
|
|
1117
|
+
};
|
|
1118
|
+
/**
|
|
1119
|
+
* Analyze pages and return the most common line-start patterns (top K).
|
|
1120
|
+
*
|
|
1121
|
+
* This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
|
|
1122
|
+
* template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
|
|
1123
|
+
*/
|
|
1124
|
+
declare const analyzeCommonLineStarts: (pages: Page[], options?: LineStartAnalysisOptions) => CommonLineStartPattern[];
|
|
1125
|
+
//#endregion
|
|
1126
|
+
//#region src/detection.d.ts
|
|
1043
1127
|
/**
|
|
1044
1128
|
* Pattern detection utilities for recognizing template tokens in Arabic text.
|
|
1045
1129
|
* Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
|
|
@@ -1114,5 +1198,5 @@ declare const analyzeTextForRule: (text: string) => {
|
|
|
1114
1198
|
detected: DetectedPattern[];
|
|
1115
1199
|
} | null;
|
|
1116
1200
|
//#endregion
|
|
1117
|
-
export { type Breakpoint, type BreakpointRule, type DetectedPattern, type ExpandResult, type Logger, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive,
|
|
1201
|
+
export { type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type DetectedPattern, type ExpandResult, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
|
|
1118
1202
|
//# sourceMappingURL=index.d.mts.map
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/analysis.ts","../src/detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EY,cD/bC,WC+bqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAgQA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBA;;;;ACtmBA;AAkEA;AAEA;AAuRA;;AAEa,cJhMA,wBIgMA,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AJ/Rb;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA,KApXK,YAAA,GAoXW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAgQA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA,KF3gBK,eAAA,GEkhBJ;EAcY;EAgBA,QAAA,EAAA,MAAA;;;;ACtmBb;AAkEA;AAEA;AAuRA;;;;;;;;ACnVA;AA+EA;AAgEA;AAuBA;AAiCA;;;;;;;;KJ3HK,qBAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiCA,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA8DC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDL,mBAAA;;;;;;;;UAQA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;AA1VX;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAgQA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBA;;;;ACtmBA;AAkEA;AAEA;AAuRA;;;;;;;;ACnVA;AA+EA;AAgEA;AAuBA;AAiCA;;;;;;cH2Qa,sBAAuB,iBAAiB,wBAAsB;;;;AF5Z3E;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;A
AiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAgQA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBA;;;;ACtmBA;AAkEA;AAEA;AAuRA;AACW,cD5RE,sBC4RF,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;;;;ACpVX;AA+EA;AAgEA;AAuBA;AAiCA;;;;;;;;;;;;;;;cFiHa,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cA2DC,mHAIV;;;;;;;;;;;;;;;;;;;;cAqHU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;AHxiBA,KI9DD,wBAAA,GJ8D8E;EA+F7E;;;;ECnIR;EA4BA,aAAA,CAAA,EAAA,MAAe;EA8Bf;EAiCA,QAAA,CAAA,EAAA,MAAA;EAwBA;EAeA,WAAA,CAAA,EAAW,MAAA;EACV;;;;EAIA,wBAAA,CAAA,EAAA,OAAA;EAAmB;AAAA;AA2FzB;AAAkD;AAgIlD;;;;EAAqE,yBAAA,CAAA,EAAA,OAAA;EAkBzD;AAqCZ;AA0EA;AA8BA;AAiDA;;EAsDkB,MAAA,CAAA,EAAA,aAAA,GAAA,OAAA;EAwDL;;AAiBb;;;;ACjPA;;;;EAAkF,UAAA,CAAA,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,MAAA,EAAA,MAAA,EAAA,GAAA,OAAA;;;;AC1ZlF;AAgQA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBA;;mBC9iBqB;;AAxDrB;AAkEA;AAEA;AAuRA;;EAEa,UAAA,CAAA,EAAA,OAAA,GAAA,OAAA;CACV;AAAsB,KA5Rb,uBAAA,GA4Ra;;;;ACtVb,KD4DA,sBAAA,GC5De;EA+Ed,OAAA,EAAA,MAAA;EAgEA,KAAA,EAAA,MAAA;EAuBA,QAAA,EDvGC,uBCgIb,EAxBa;AAgCd,CAAA;;;;;;;cD4Ia,iCACF,kBACE,6BACV;;;;AJhSH;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KI7GO,eAAA,GJ6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;AA8GmB,cInmBN,mBJmmBM,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GInmB6B,eJmmB7B,EAAA;AAiBnB;;;;ACjPA;;;;;;;;AC1ZA;AAgQa,cEzKA,wBFyKsB,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EEzK8B,eFyK9B,EAAA,EAAA,GAAA,MAAA;AA2CnC;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBa,cExbA,oBFwbsF,EAAA,CAAA,QAAA,EEvbrF,eFubqF,EAAA,EAAA,GAAA;;;;ACtmBnG,CAAA;AAkEA;AAEA;AAuRA;;;;AAGyB,cC/IZ,kBD+IY,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA;;;;ECtVb,QAAA,CAAA,EAAA,MAAA;EA+EC,QAAA,EA+HC,eA9Eb,EAAA;AAeD,CAAA,GAAa,IAAA"}
|